xref: /onnv-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 3489:aac22029cb02)
13446Smrj /*
23446Smrj  * CDDL HEADER START
33446Smrj  *
43446Smrj  * The contents of this file are subject to the terms of the
53446Smrj  * Common Development and Distribution License (the "License").
63446Smrj  * You may not use this file except in compliance with the License.
73446Smrj  *
83446Smrj  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
93446Smrj  * or http://www.opensolaris.org/os/licensing.
103446Smrj  * See the License for the specific language governing permissions
113446Smrj  * and limitations under the License.
123446Smrj  *
133446Smrj  * When distributing Covered Code, include this CDDL HEADER in each
143446Smrj  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
153446Smrj  * If applicable, add the following below this CDDL HEADER, with the
163446Smrj  * fields enclosed by brackets "[]" replaced with your own identifying
173446Smrj  * information: Portions Copyright [yyyy] [name of copyright owner]
183446Smrj  *
193446Smrj  * CDDL HEADER END
203446Smrj  */
213446Smrj 
223446Smrj /*
233446Smrj  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
243446Smrj  * Use is subject to license terms.
253446Smrj  */
263446Smrj 
273446Smrj #pragma ident	"%Z%%M%	%I%	%E% SMI"
283446Smrj 
293446Smrj #include <sys/types.h>
303446Smrj #include <sys/machparam.h>
313446Smrj #include <sys/x86_archext.h>
323446Smrj #include <sys/systm.h>
333446Smrj #include <sys/mach_mmu.h>
343446Smrj 
353446Smrj #include <sys/multiboot.h>
363446Smrj 
373446Smrj extern multiboot_header_t mb_header;
383446Smrj extern int have_cpuid(void);
393446Smrj extern uint32_t get_cpuid_edx(uint32_t *eax);
403446Smrj 
413446Smrj #include <sys/inttypes.h>
423446Smrj #include <sys/bootinfo.h>
433446Smrj #include <sys/mach_mmu.h>
443446Smrj #include <sys/boot_console.h>
453446Smrj 
463446Smrj #include "dboot_printf.h"
473446Smrj #include "dboot_xboot.h"
483446Smrj #include "dboot_elfload.h"
493446Smrj 
503446Smrj /*
513446Smrj  * This file contains code that runs to transition us from either a multiboot
523446Smrj  * compliant loader (32 bit non-paging) or Xen domain loader to regular kernel
533446Smrj  * execution. Its task is to setup the kernel memory image and page tables.
543446Smrj  *
553446Smrj  * The code executes as:
563446Smrj  *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
573446Smrj  * 	- 32 bit program for Xen 32 bit
583446Smrj  *	- 64 bit program for Xen 64 bit (at least that's my assumption for now)
593446Smrj  *
603446Smrj  * Under Xen, we must create mappings for any memory beyond the initial
613446Smrj  * start of day allocation (such as the kernel itself).
623446Smrj  *
633446Smrj  * When not under Xen, the mapping between maddr_t and paddr_t is 1:1.
 * Since paging is not yet enabled, all such memory is directly accessible.
653446Smrj  */
663446Smrj 
673446Smrj /*
683446Smrj  * Standard bits used in PTE (page level) and PTP (internal levels)
693446Smrj  */
703446Smrj x86pte_t ptp_bits = PT_VALID | PT_REF | PT_USER | PT_WRITABLE | PT_USER;
713446Smrj x86pte_t pte_bits = PT_VALID | PT_REF | PT_MOD | PT_NOCONSIST | PT_WRITABLE;
723446Smrj 
/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On Xen this is actually a virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation; only ever grows upward,
 * see check_higher() and do_mem_alloc().
 */
static paddr_t next_avail_addr = 0;

/* multiboot information structure handed to us by the boot loader */
multiboot_info_t *mb_info;

/*
 * This contains information passed to the kernel.  Two entries are
 * declared so that "bi" can be rounded up to a 16 byte boundary in
 * startup_kernel().
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;
993446Smrj 
/*
 * Page table and memory stuff.
 */
static uint64_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU features, discovered via cpuid in
 * startup_kernel().  Zero means not supported (or deliberately unused).
 */
int amd64_support = 0;		/* long mode (CPUID_AMD_EDX_LM) */
int largepage_support = 0;	/* large pages (CPUID_INTC_EDX_PSE) */
int pae_support = 0;		/* PAE pagetables (CPUID_INTC_EDX_PAE) */
int pge_support = 0;		/* global pages (CPUID_INTC_EDX_PGE) */
int NX_support = 0;		/* no-execute (CPUID_AMD_EDX_NX) */

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;

#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging flags, set from the boot command line in startup_kernel().
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;
1393446Smrj /*
1403446Smrj  * The Xen/Grub specific code builds the initial memlists. This code does
1413446Smrj  * sort/merge/link for final use.
1423446Smrj  */
1433446Smrj static void
1443446Smrj sort_physinstall(void)
1453446Smrj {
1463446Smrj 	int i;
1473446Smrj 	int j;
1483446Smrj 	struct boot_memlist tmp;
1493446Smrj 
1503446Smrj 	/*
1513446Smrj 	 * Now sort the memlists, in case they weren't in order.
1523446Smrj 	 * Yeah, this is a bubble sort; small, simple and easy to get right.
1533446Smrj 	 */
1543446Smrj 	DBG_MSG("Sorting phys-installed list\n");
1553446Smrj 	for (j = memlists_used - 1; j > 0; --j) {
1563446Smrj 		for (i = 0; i < j; ++i) {
1573446Smrj 			if (memlists[i].addr < memlists[i + 1].addr)
1583446Smrj 				continue;
1593446Smrj 			tmp = memlists[i];
1603446Smrj 			memlists[i] = memlists[i + 1];
1613446Smrj 			memlists[i + 1] = tmp;
1623446Smrj 		}
1633446Smrj 	}
1643446Smrj 
1653446Smrj 	/*
1663446Smrj 	 * Merge any memlists that don't have holes between them.
1673446Smrj 	 */
1683446Smrj 	for (i = 0; i <= memlists_used - 1; ++i) {
1693446Smrj 		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
1703446Smrj 			continue;
1713446Smrj 
1723446Smrj 		if (prom_debug)
1733446Smrj 			dboot_printf(
1743446Smrj 			    "merging mem segs %" PRIx64 "...%" PRIx64
1753446Smrj 			    " w/ %" PRIx64 "...%" PRIx64 "\n",
1763446Smrj 			    memlists[i].addr,
1773446Smrj 			    memlists[i].addr + memlists[i].size,
1783446Smrj 			    memlists[i + 1].addr,
1793446Smrj 			    memlists[i + 1].addr + memlists[i + 1].size);
1803446Smrj 
1813446Smrj 		memlists[i].size += memlists[i + 1].size;
1823446Smrj 		for (j = i + 1; j < memlists_used - 1; ++j)
1833446Smrj 			memlists[j] = memlists[j + 1];
1843446Smrj 		--memlists_used;
1853446Smrj 		DBG(memlists_used);
1863446Smrj 		--i;	/* after merging we need to reexamine, so do this */
1873446Smrj 	}
1883446Smrj 
1893446Smrj 	if (prom_debug) {
1903446Smrj 		dboot_printf("\nFinal memlists:\n");
1913446Smrj 		for (i = 0; i < memlists_used; ++i) {
1923446Smrj 			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
1933446Smrj 			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
1943446Smrj 		}
1953446Smrj 	}
1963446Smrj 
1973446Smrj 	/*
1983446Smrj 	 * link together the memlists with native size pointers
1993446Smrj 	 */
2003446Smrj 	memlists[0].next = 0;
2013446Smrj 	memlists[0].prev = 0;
2023446Smrj 	for (i = 1; i < memlists_used; ++i) {
2033446Smrj 		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
2043446Smrj 		memlists[i].next = 0;
2053446Smrj 		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
2063446Smrj 	}
2073446Smrj 	bi->bi_phys_install = (native_ptr_t)memlists;
2083446Smrj 	DBG(bi->bi_phys_install);
2093446Smrj }
2103446Smrj 
2113446Smrj x86pte_t
2123446Smrj get_pteval(paddr_t table, uint_t index)
2133446Smrj {
2143446Smrj 	if (pae_support)
2153446Smrj 		return (((x86pte_t *)(uintptr_t)table)[index]);
2163446Smrj 	return (((x86pte32_t *)(uintptr_t)table)[index]);
2173446Smrj }
2183446Smrj 
2193446Smrj /*ARGSUSED*/
2203446Smrj void
2213446Smrj set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
2223446Smrj {
2233446Smrj 	uintptr_t tab_addr = (uintptr_t)table;
2243446Smrj 
2253446Smrj 	if (pae_support)
2263446Smrj 		((x86pte_t *)tab_addr)[index] = pteval;
2273446Smrj 	else
2283446Smrj 		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
2293446Smrj 	if (level == top_level && level == 2)
2303446Smrj 		reload_cr3();
2313446Smrj }
2323446Smrj 
2333446Smrj paddr_t
2343446Smrj make_ptable(x86pte_t *pteval, uint_t level)
2353446Smrj {
2363446Smrj 	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
2373446Smrj 
2383446Smrj 	if (level == top_level && level == 2)
2393446Smrj 		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
2403446Smrj 	else
2413446Smrj 		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
2423446Smrj 
2433446Smrj 	if (map_debug)
2443446Smrj 		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
2453446Smrj 		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
2463446Smrj 	return (new_table);
2473446Smrj }
2483446Smrj 
2493446Smrj x86pte_t *
2503446Smrj map_pte(paddr_t table, uint_t index)
2513446Smrj {
2523446Smrj 	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
2533446Smrj }
2543446Smrj 
#if 0	/* useful if debugging */
/*
 * dump out the contents of page tables...
 *
 * Walks the pagetable tree iteratively: descent into a lower-level
 * table is done by saving table/index in save_table[]/save_index[] and
 * jumping back to the "recursion" label, and ascent is done by the
 * trailing "if (l < top_level)" block.  Do not restructure casually;
 * the goto targets are inside the loop body.
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;		/* current pagetable level */
	uint64_t va;		/* virtual address the current entry maps */
	uint64_t pgsize;	/* bytes mapped by one entry at this level */
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;	/* indent by depth */
	/*
	 * NOTE(review): pa/pa1 are uint_t but are assigned from
	 * ma_to_pa() (may truncate) and printed with PRIx64 below --
	 * a varargs width mismatch; harmless only while pa < 4G.
	 */
	uint_t pa, pa1;

	dboot_printf("Finished pagetables:\n");
	table = (char *)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %lx[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%" PRIx64 "\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			/* descend: save position, restart scan in child */
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;	/* ++index in for loop makes this 0 */
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	/* ascend: resume scanning the parent table where we left off */
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
#endif
3403446Smrj 
3413446Smrj /*
3423446Smrj  * Add a mapping for the physical page at the given virtual address.
3433446Smrj  */
3443446Smrj static void
3453446Smrj map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
3463446Smrj {
3473446Smrj 	x86pte_t *ptep;
3483446Smrj 	x86pte_t pteval;
3493446Smrj 
3503446Smrj 	pteval = pa_to_ma(pa) | pte_bits;
3513446Smrj 	if (level > 0)
3523446Smrj 		pteval |= PT_PAGESIZE;
3533446Smrj 	if (va >= target_kernel_text && pge_support)
3543446Smrj 		pteval |= PT_GLOBAL;
3553446Smrj 
3563446Smrj 	if (map_debug && pa != va)
3573446Smrj 		dboot_printf("mapping pa=0x%" PRIx64 " va=0x%" PRIx64
3583446Smrj 		    " pte=0x%" PRIx64 " l=%d\n",
3593446Smrj 		    (uint64_t)pa, (uint64_t)va, pteval, level);
3603446Smrj 
3613446Smrj 	/*
3623446Smrj 	 * Find the pte that will map this address. This creates any
3633446Smrj 	 * missing intermediate level page tables
3643446Smrj 	 */
3653446Smrj 	ptep = find_pte(va, NULL, level, 0);
3663446Smrj 
3673446Smrj 	/*
3683446Smrj 	 * On Xen we must use hypervisor calls to modify the PTE, since
3693446Smrj 	 * paging is active. On real hardware we just write to the pagetables
3703446Smrj 	 * which aren't in use yet.
3713446Smrj 	 */
3723446Smrj 	if (va < 1024 * 1024)
3733446Smrj 		pteval |= PT_NOCACHE;		/* for video RAM */
3743446Smrj 	if (pae_support)
3753446Smrj 		*ptep = pteval;
3763446Smrj 	else
3773446Smrj 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
3783446Smrj }
3793446Smrj 
3803446Smrj /*
3813446Smrj  * During memory allocation, find the highest address not used yet.
3823446Smrj  */
3833446Smrj static void
3843446Smrj check_higher(paddr_t a)
3853446Smrj {
3863446Smrj 	if (a < next_avail_addr)
3873446Smrj 		return;
3883446Smrj 	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
3893446Smrj 	DBG(next_avail_addr);
3903446Smrj }
3913446Smrj 
/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;	/* PCI space starts at 1M */
const uint64_t pci_hi_limit = 0xfff00000ul;	/* ... and ends below 4G */

/*
 * Carve [start, end) out of pcimemlists[].  The entry overlapping the
 * range is either deleted outright, split in two, or trimmed at the
 * front or back edge.  The else-if chain relies on earlier cases having
 * been excluded; preserve the ordering.
 */
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			/* close the gap by shifting later entries down */
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			/* duplicate entry i into i + 1, then trim each half */
			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			/*
			 * ml now points at the copy (entry i + 1), which
			 * still holds the original addr and size.
			 */
			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}
445*3489Sjosephb 
446*3489Sjosephb /*
4473446Smrj  * Walk through the module information finding the last used address.
4483446Smrj  * The first available address will become the top level page table.
4493446Smrj  *
4503446Smrj  * We then build the phys_install memlist from the multiboot information.
4513446Smrj  */
4523446Smrj static void
4533446Smrj init_mem_alloc(void)
4543446Smrj {
4553446Smrj 	mb_memory_map_t *mmap;
4563446Smrj 	mb_module_t *mod;
4573446Smrj 	uint64_t start;
4583446Smrj 	uint64_t end;
4593446Smrj 	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
4603446Smrj 	extern char _end[];
4613446Smrj 	int i;
4623446Smrj 
4633446Smrj 	DBG_MSG("Entered init_mem_alloc()\n");
4643446Smrj 	DBG((uintptr_t)mb_info);
4653446Smrj 
4663446Smrj 	/*
4673446Smrj 	 * search the modules to find the last used address
4683446Smrj 	 * we'll build the module list while we're walking through here
4693446Smrj 	 */
4703446Smrj 	DBG_MSG("\nFinding Modules\n");
4713446Smrj 	check_higher((paddr_t)&_end);
4723446Smrj 	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
4733446Smrj 	    i < mb_info->mods_count;
4743446Smrj 	    ++mod, ++i) {
4753446Smrj 		if (prom_debug) {
4763446Smrj 			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
4773446Smrj 			    i, (char *)(mod->mod_name),
4783446Smrj 			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
4793446Smrj 		}
4803446Smrj 		modules[i].bm_addr = mod->mod_start;
4813446Smrj 		modules[i].bm_size = mod->mod_end;
4823446Smrj 
4833446Smrj 		check_higher(mod->mod_end);
4843446Smrj 	}
4853446Smrj 	bi->bi_modules = (native_ptr_t)modules;
4863446Smrj 	DBG(bi->bi_modules);
4873446Smrj 	bi->bi_module_cnt = mb_info->mods_count;
4883446Smrj 	DBG(bi->bi_module_cnt);
4893446Smrj 
4903446Smrj 	/*
491*3489Sjosephb 	 * start out by assuming PCI can use all physical addresses
492*3489Sjosephb 	 */
493*3489Sjosephb 	pcimemlists[0].addr = pci_lo_limit;
494*3489Sjosephb 	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
495*3489Sjosephb 	pcimemlists_used = 1;
496*3489Sjosephb 
497*3489Sjosephb 	/*
4983446Smrj 	 * Walk through the memory map from multiboot and build our memlist
4993446Smrj 	 * structures. Note these will have native format pointers.
5003446Smrj 	 */
5013446Smrj 	DBG_MSG("\nFinding Memory Map\n");
5023446Smrj 	DBG(mb_info->flags);
5033446Smrj 	max_mem = 0;
5043446Smrj 	if (mb_info->flags & 0x40) {
5053446Smrj 		DBG(mb_info->mmap_addr);
5063446Smrj 		DBG(mb_info->mmap_length);
5073446Smrj 		check_higher(mb_info->mmap_addr + mb_info->mmap_length);
5083446Smrj 
5093446Smrj 		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
5103446Smrj 		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
5113446Smrj 		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
5123446Smrj 		    + sizeof (mmap->size))) {
5133446Smrj 
5143446Smrj 			start = ((uint64_t)mmap->base_addr_high << 32) +
5153446Smrj 			    mmap->base_addr_low;
5163446Smrj 			end = start + ((uint64_t)mmap->length_high << 32) +
5173446Smrj 			    mmap->length_low;
5183446Smrj 
519*3489Sjosephb 			if (prom_debug)
5203446Smrj 				dboot_printf("\ttype: %d %" PRIx64 "..%"
5213446Smrj 				    PRIx64 "\n", mmap->type, start, end);
5223446Smrj 
5233446Smrj 			/*
5243446Smrj 			 * page align start and end
5253446Smrj 			 */
5263446Smrj 			start = (start + page_offset) & ~page_offset;
5273446Smrj 			end &= ~page_offset;
5283446Smrj 			if (end <= start)
5293446Smrj 				continue;
5303446Smrj 
531*3489Sjosephb 			exclude_from_pci(start, end);
532*3489Sjosephb 
533*3489Sjosephb 			/*
534*3489Sjosephb 			 * only type 1 is usable RAM
535*3489Sjosephb 			 */
536*3489Sjosephb 			if (mmap->type != 1)
537*3489Sjosephb 				continue;
538*3489Sjosephb 
5393446Smrj 			if (end > max_mem)
5403446Smrj 				max_mem = end;
5413446Smrj 
5423446Smrj 			memlists[memlists_used].addr = start;
5433446Smrj 			memlists[memlists_used].size = end - start;
544*3489Sjosephb 			++memlists_used;
545*3489Sjosephb 			if (memlists_used > MAX_MEMLIST)
546*3489Sjosephb 				dboot_panic("too many memlists");
5473446Smrj 		}
5483446Smrj 	} else if (mb_info->flags & 0x01) {
5493446Smrj 		DBG(mb_info->mem_lower);
5503446Smrj 		memlists[memlists_used].addr = 0;
5513446Smrj 		memlists[memlists_used].size = mb_info->mem_lower * 1024;
5523446Smrj 		++memlists_used;
5533446Smrj 		DBG(mb_info->mem_upper);
5543446Smrj 		memlists[memlists_used].addr = 1024 * 1024;
5553446Smrj 		memlists[memlists_used].size = mb_info->mem_upper * 1024;
5563446Smrj 		++memlists_used;
557*3489Sjosephb 		exclude_from_pci(memlists[0].addr,
558*3489Sjosephb 		    memlists[0].addr + memlists[memlists_used].size);
559*3489Sjosephb 		exclude_from_pci(memlists[1].addr,
560*3489Sjosephb 		    memlists[1].addr + memlists[memlists_used].size);
5613446Smrj 	} else {
5623446Smrj 		dboot_panic("No memory info from boot loader!!!\n");
5633446Smrj 	}
5643446Smrj 
5653446Smrj 	check_higher(bi->bi_cmdline);
5663446Smrj 
5673446Smrj 	/*
5683446Smrj 	 * finish processing the physinstall list
5693446Smrj 	 */
5703446Smrj 	sort_physinstall();
571*3489Sjosephb 
572*3489Sjosephb 	/*
573*3489Sjosephb 	 * Finish off the pcimemlist
574*3489Sjosephb 	 */
575*3489Sjosephb 	if (prom_debug) {
576*3489Sjosephb 		for (i = 0; i < pcimemlists_used; ++i) {
577*3489Sjosephb 			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
578*3489Sjosephb 				    PRIx64 "\n", pcimemlists[i].addr,
579*3489Sjosephb 				pcimemlists[i].addr + pcimemlists[i].size);
580*3489Sjosephb 		}
581*3489Sjosephb 	}
582*3489Sjosephb 	pcimemlists[0].next = 0;
583*3489Sjosephb 	pcimemlists[0].prev = 0;
584*3489Sjosephb 	for (i = 1; i < pcimemlists_used; ++i) {
585*3489Sjosephb 		pcimemlists[i].prev =
586*3489Sjosephb 		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
587*3489Sjosephb 		pcimemlists[i].next = 0;
588*3489Sjosephb 		pcimemlists[i - 1].next =
589*3489Sjosephb 		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
590*3489Sjosephb 	}
591*3489Sjosephb 	bi->bi_pcimem = (native_ptr_t)pcimemlists;
592*3489Sjosephb 	DBG(bi->bi_pcimem);
5933446Smrj }
5943446Smrj 
5953446Smrj /*
5963446Smrj  * Simple memory allocator, allocates aligned physical memory.
5973446Smrj  * Note that startup_kernel() only allocates memory, never frees.
5983446Smrj  * Memory usage just grows in an upward direction.
5993446Smrj  */
6003446Smrj static void *
6013446Smrj do_mem_alloc(uint32_t size, uint32_t align)
6023446Smrj {
6033446Smrj 	uint_t i;
6043446Smrj 	uint64_t best;
6053446Smrj 	uint64_t start;
6063446Smrj 	uint64_t end;
6073446Smrj 
6083446Smrj 	/*
6093446Smrj 	 * make sure size is a multiple of pagesize
6103446Smrj 	 */
6113446Smrj 	size = RNDUP(size, MMU_PAGESIZE);
6123446Smrj 	next_avail_addr = RNDUP(next_avail_addr, align);
6133446Smrj 
6143446Smrj 	/*
6153446Smrj 	 * a really large bootarchive that causes you to run out of memory
6163446Smrj 	 * may cause this to blow up
6173446Smrj 	 */
6183446Smrj 	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
6193446Smrj 	best = (uint64_t)-size;
6203446Smrj 	for (i = 0; i < memlists_used; ++i) {
6213446Smrj 		start = memlists[i].addr;
6223446Smrj 		end = start + memlists[i].size;
6233446Smrj 
6243446Smrj 		/*
6253446Smrj 		 * did we find the desired address?
6263446Smrj 		 */
6273446Smrj 		if (start <= next_avail_addr && next_avail_addr + size <= end) {
6283446Smrj 			best = next_avail_addr;
6293446Smrj 			goto done;
6303446Smrj 		}
6313446Smrj 
6323446Smrj 		/*
6333446Smrj 		 * if not is this address the best so far?
6343446Smrj 		 */
6353446Smrj 		if (start > next_avail_addr && start < best &&
6363446Smrj 		    RNDUP(start, align) + size <= end)
6373446Smrj 			best = RNDUP(start, align);
6383446Smrj 	}
6393446Smrj 
6403446Smrj 	/*
6413446Smrj 	 * We didn't find exactly the address we wanted, due to going off the
6423446Smrj 	 * end of a memory region. Return the best found memory address.
6433446Smrj 	 */
6443446Smrj done:
6453446Smrj 	next_avail_addr = best + size;
6463446Smrj 	(void) memset((void *)(uintptr_t)best, 0, size);
6473446Smrj 	return ((void *)(uintptr_t)best);
6483446Smrj }
6493446Smrj 
6503446Smrj void *
6513446Smrj mem_alloc(uint32_t size)
6523446Smrj {
6533446Smrj 	return (do_mem_alloc(size, MMU_PAGESIZE));
6543446Smrj }
6553446Smrj 
6563446Smrj 
/*
 * Build page tables to map all of memory used so far as well as the kernel.
 * Relies on mem_alloc() growing strictly upward so that pagetable pages
 * allocated here land inside the 1:1 mapped region created below.
 */
static void
build_page_tables(void)
{
	uint32_t psize;		/* page size used to map the nucleus */
	uint32_t level;		/* pagetable level for nucleus mappings */
	uint32_t off;
	uint32_t i;
	uint64_t start;
	uint64_t end;
	uint64_t next_mapping;	/* lowest address worth 1:1 mapping */

	/*
	 * If we're not using Xen, we need to create the top level pagetable.
	 */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

	/*
	 * Under multiboot we need 1:1 mappings for all of low memory, which
	 * includes our pagetables. The following code works because our
	 * simple memory allocator only grows usage in an upwards direction.
	 *
	 * We map *all* possible addresses below 1 Meg, since things like
	 * the video RAM are down there.
	 *
	 * Skip memory between 1M and _start, this acts as a reserve
	 * of memory usable for DMA.
	 */
	next_mapping = (uintptr_t)_start & MMU_PAGEMASK;
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE)
		map_pa_at_va(start, start, 0);

	/* 1:1 map the used portion of each RAM memlist */
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		if (start < next_mapping)
			start = next_mapping;

		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		/* stop at next_avail_addr: nothing above it is in use yet */
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
	}

	DBG_MSG("\nPage tables constructed\n");
}
7403446Smrj 
/*
 * Message printed when the "multiboot" grub command appears on the boot
 * command line; direct ELF loading replaced that boot path.
 */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"
7473446Smrj 
/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at virtual address of target_kernel_text using large page
 * mappings. The page table pages are also accessible at 1:1 mapped
 * virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	uintptr_t addr;

	/*
	 * At this point we are executing in 32 bit protected mode with
	 * paging disabled, as handed off by the multiboot loader.
	 */
	cmdline = (char *)mb_info->cmdline;
	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
	map_debug = (strstr(cmdline, "map_debug") != NULL);
	bcons_init(cmdline);	/* console must come up before any output */
	DBG_MSG("\n\nSolaris prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	/*
	 * boot info must be 16 byte aligned for 64 bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
	bi = (struct xboot_info *)addr;
	DBG((uintptr_t)bi);
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;

	/*
	 * Need correct target_kernel_text value
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

	/*
	 * use cpuid to enable MMU features: cpuid function 1 edx gives
	 * PSE/PGE/PAE; extended function 0x80000001 edx gives long mode
	 * and no-execute support.
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}

#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting\n");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting\n");
#endif

	/*
	 * initialize our memory allocator
	 */
	init_mem_alloc();

	/*
	 * configure mmu information: a 64 bit kernel always uses PAE
	 * format tables; a 32 bit kernel uses them only when PAE is
	 * supported and either memory exceeds 4G or NX is wanted.
	 */
#if !defined(_BOOT_TARGET_amd64)
	if (pae_support && (max_mem > FOUR_GIG || NX_support)) {
#endif
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
#if !defined(_BOOT_TARGET_amd64)
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}
#endif

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory\n");
	if (dboot_elfload64(mb_header.load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting\n");

#endif
	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * return to assembly code to switch to running kernel
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;
	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_info = (uintptr_t)mb_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table;

	/*
	 * NOTE(review): kseg_size is 4M while the nucleus (ksize) is 8M;
	 * presumably kseg refers to the mapping granule, not the
	 * nucleus size -- verify against the kernel consumer.
	 */
	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#if 0		/* useful if debugging initial page tables */
	if (prom_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
}
920