xref: /onnv-gate/usr/src/uts/i86pc/dboot/dboot_startkern.c (revision 11387:0072514d53c7)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


#include <sys/types.h>
#include <sys/machparam.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/mach_mmu.h>
#include <sys/multiboot.h>

#if defined(__xpv)

#include <sys/hypervisor.h>
uintptr_t xen_virt_start;
pfn_t *mfn_to_pfn_mapping;

#else /* !__xpv */

extern multiboot_header_t mb_header;
extern int have_cpuid(void);

#endif /* !__xpv */

#include <sys/inttypes.h>
#include <sys/bootinfo.h>
#include <sys/mach_mmu.h>
#include <sys/boot_console.h>

#include "dboot_asm.h"
#include "dboot_printf.h"
#include "dboot_xboot.h"
#include "dboot_elfload.h"

/*
 * This file contains code that runs to transition us from either a multiboot
 * compliant loader (32 bit non-paging) or an XPV domain loader to
 * regular kernel execution. Its task is to set up the kernel memory image
 * and page tables.
 *
 * The code executes as:
 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
 *	- a 32 bit program for the 32-bit PV hypervisor
 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
 *
 * Under the PV hypervisor, we must create mappings for any memory beyond the
 * initial start of day allocation (such as the kernel itself).
 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since paging is not enabled yet, all such memory is directly accessible.
 */

/*
 * Standard bits used in PTE (page level) and PTP (internal levels)
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target address (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is set up in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation
 */
static paddr_t next_avail_addr = 0;

#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */

/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging flags
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;

/*
 * Either hypervisor-specific or grub-specific code builds the initial
 * memlists. This code does the sort/merge/link for final use.
 */
static void
sort_physinstall(void)
{
	int i;
#if !defined(__xpv)
	int j;
	struct boot_memlist tmp;

	/*
	 * Now sort the memlists, in case they weren't in order.
	 * Yeah, this is a bubble sort; small, simple and easy to get right.
	 */
	DBG_MSG("Sorting phys-installed list\n");
	for (j = memlists_used - 1; j > 0; --j) {
		for (i = 0; i < j; ++i) {
			if (memlists[i].addr < memlists[i + 1].addr)
				continue;
			tmp = memlists[i];
			memlists[i] = memlists[i + 1];
			memlists[i + 1] = tmp;
		}
	}

	/*
	 * Merge any memlists that don't have holes between them.
	 */
	for (i = 0; i <= memlists_used - 1; ++i) {
		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
			continue;

		if (prom_debug)
			dboot_printf(
			    "merging mem segs %" PRIx64 "...%" PRIx64
			    " w/ %" PRIx64 "...%" PRIx64 "\n",
			    memlists[i].addr,
			    memlists[i].addr + memlists[i].size,
			    memlists[i + 1].addr,
			    memlists[i + 1].addr + memlists[i + 1].size);

		memlists[i].size += memlists[i + 1].size;
		for (j = i + 1; j < memlists_used - 1; ++j)
			memlists[j] = memlists[j + 1];
		--memlists_used;
		DBG(memlists_used);
		--i;	/* after merging we need to reexamine, so do this */
	}
#endif	/* __xpv */

	if (prom_debug) {
		dboot_printf("\nFinal memlists:\n");
		for (i = 0; i < memlists_used; ++i) {
			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
		}
	}

	/*
	 * link together the memlists with native size pointers
	 */
	memlists[0].next = 0;
	memlists[0].prev = 0;
	for (i = 1; i < memlists_used; ++i) {
		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
		memlists[i].next = 0;
		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
	}
	bi->bi_phys_install = (native_ptr_t)memlists;
	DBG(bi->bi_phys_install);
}
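
/*
 * Worked example (hypothetical values, for illustration only): given the
 * unsorted entries {0x100000, 0x100000}, {0, 0x9f000} and
 * {0x200000, 0x100000}, the bubble sort above orders them by address;
 * the merge pass then sees 0x100000 + 0x100000 == 0x200000 (no hole)
 * and folds the last two into a single {0x100000, 0x200000} entry
 * before the list is linked.
 */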

/*
 * build bios reserved memlists
 */
static void
build_rsvdmemlists(void)
{
	int i;

	rsvdmemlists[0].next = 0;
	rsvdmemlists[0].prev = 0;
	for (i = 1; i < rsvdmemlists_used; ++i) {
		rsvdmemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
		rsvdmemlists[i].next = 0;
		rsvdmemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
	}
	bi->bi_rsvdmem = (native_ptr_t)rsvdmemlists;
	DBG(bi->bi_rsvdmem);
}

#if defined(__xpv)

/*
 * halt on the hypervisor after a delay to drain console output
 */
void
dboot_halt(void)
{
	uint_t i = 10000;

	while (--i)
		(void) HYPERVISOR_yield();
	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}

/*
 * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical addresses are contiguous and run from mfn_base in each VM.
 * Machine addresses are the real underlying hardware addresses.
 * These are needed for page table entries. Note that this routine is
 * poorly protected. A bad value of "ma" will cause a page fault.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
#ifdef DEBUG
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
#endif
	return (pa);
}

/*
 * From a pseudo-physical address, find the corresponding machine address.
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	pfn_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
#ifdef DEBUG
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
#endif
	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}
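
/*
 * Worked example (hypothetical values, for illustration only): with
 * mfn_base == ONE_GIG (0x40000000) and mfn_list[5] == 0x1234, the
 * pseudo-physical address 0x40005000 gives pfn 5, so pa_to_ma()
 * returns mfn_to_ma(0x1234), i.e. machine page 0x1234; ma_to_pa() of
 * that result uses mfn_to_pfn_mapping[0x1234] == 5 to get back to
 * 0x40005000.
 */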

#endif	/* __xpv */

x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	if (pae_support)
		return (((x86pte_t *)(uintptr_t)table)[index]);
	return (((x86pte32_t *)(uintptr_t)table)[index]);
}

/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
#ifdef __xpv
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		dboot_panic("HYPERVISOR_mmu_update() failed");
#else /* __xpv */
	uintptr_t tab_addr = (uintptr_t)table;

	if (pae_support)
		((x86pte_t *)tab_addr)[index] = pteval;
	else
		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
	if (level == top_level && level == 2)
		reload_cr3();
#endif /* __xpv */
}
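
/*
 * Note on the hypervisor path above (summarizing the Xen interface): the
 * low bits of mmu_update_t.ptr encode the request type -- here
 * MMU_NORMAL_PT_UPDATE, an ordinary PTE write that the hypervisor
 * validates -- while the upper bits give the machine address of the PTE
 * being modified.
 */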

paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);

	if (level == top_level && level == 2)
		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
	else
		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;

#ifdef __xpv
	/* Remove write permission to the new page table. */
	if (HYPERVISOR_update_va_mapping(new_table,
	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
		dboot_panic("HYP_update_va_mapping error");
#endif

	if (map_debug)
		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
	return (new_table);
}

x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
}

/*
 * dump out the contents of page tables...
 */
static void
dump_tables(void)
{
	uint_t save_index[4];	/* for recursion */
	char *save_table[4];	/* for recursion */
	uint_t	l;
	uint64_t va;
	uint64_t pgsize;
	int index;
	int i;
	x86pte_t pteval;
	char *table;
	static char *tablist = "\t\t\t";
	char *tabs = tablist + 3 - top_level;
	uint_t pa, pa1;
#if !defined(__xpv)
#define	maddr_t paddr_t
#endif /* !__xpv */

	dboot_printf("Finished pagetables:\n");
	table = (char *)(uintptr_t)top_page_table;
	l = top_level;
	va = 0;
	for (index = 0; index < ptes_per_table; ++index) {
		pgsize = 1ull << shift_amt[l];
		if (pae_support)
			pteval = ((x86pte_t *)table)[index];
		else
			pteval = ((x86pte32_t *)table)[index];
		if (pteval == 0)
			goto next_entry;

		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
		pa = ma_to_pa(pteval & MMU_PAGEMASK);
		dboot_printf(" physaddr=%x\n", pa);

		/*
		 * Don't try to walk hypervisor private pagetables
		 */
		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
			save_table[l] = table;
			save_index[l] = index;
			--l;
			index = -1;
			table = (char *)(uintptr_t)
			    ma_to_pa(pteval & MMU_PAGEMASK);
			goto recursion;
		}

		/*
		 * shorten dump for consecutive mappings
		 */
		for (i = 1; index + i < ptes_per_table; ++i) {
			if (pae_support)
				pteval = ((x86pte_t *)table)[index + i];
			else
				pteval = ((x86pte32_t *)table)[index + i];
			if (pteval == 0)
				break;
			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
			if (pa1 != pa + i * pgsize)
				break;
		}
		if (i > 2) {
			dboot_printf("%s...\n", tabs + l);
			va += pgsize * (i - 2);
			index += i - 2;
		}
next_entry:
		va += pgsize;
		if (l == 3 && index == 256)	/* VA hole */
			va = 0xffff800000000000ull;
recursion:
		;
	}
	if (l < top_level) {
		++l;
		index = save_index[l];
		table = save_table[l];
		goto recursion;
	}
}
4767656SSherry.Moore@Sun.COM 
4773446Smrj /*
4785084Sjohnlev  * Add a mapping for the machine page at the given virtual address.
4793446Smrj  */
4803446Smrj static void
map_ma_at_va(maddr_t ma,native_ptr_t va,uint_t level)4815084Sjohnlev map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
4823446Smrj {
4833446Smrj 	x86pte_t *ptep;
4843446Smrj 	x86pte_t pteval;
4853446Smrj 
4865084Sjohnlev 	pteval = ma | pte_bits;
4873446Smrj 	if (level > 0)
4883446Smrj 		pteval |= PT_PAGESIZE;
4893446Smrj 	if (va >= target_kernel_text && pge_support)
4903446Smrj 		pteval |= PT_GLOBAL;
4913446Smrj 
4925084Sjohnlev 	if (map_debug && ma != va)
4935084Sjohnlev 		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
4943446Smrj 		    " pte=0x%" PRIx64 " l=%d\n",
4955084Sjohnlev 		    (uint64_t)ma, (uint64_t)va, pteval, level);
4965084Sjohnlev 
4975084Sjohnlev #if defined(__xpv)
4985084Sjohnlev 	/*
4995084Sjohnlev 	 * see if we can avoid find_pte() on the hypervisor
5005084Sjohnlev 	 */
5015084Sjohnlev 	if (HYPERVISOR_update_va_mapping(va, pteval,
5025084Sjohnlev 	    UVMF_INVLPG | UVMF_LOCAL) == 0)
5035084Sjohnlev 		return;
5045084Sjohnlev #endif
5053446Smrj 
5063446Smrj 	/*
5073446Smrj 	 * Find the pte that will map this address. This creates any
5083446Smrj 	 * missing intermediate level page tables
5093446Smrj 	 */
5103446Smrj 	ptep = find_pte(va, NULL, level, 0);
5113446Smrj 
5123446Smrj 	/*
5135084Sjohnlev 	 * When paravirtualized, we must use hypervisor calls to modify the
5145084Sjohnlev 	 * PTE, since paging is active. On real hardware we just write to
5155084Sjohnlev 	 * the pagetables which aren't in use yet.
5163446Smrj 	 */
5175084Sjohnlev #if defined(__xpv)
5185084Sjohnlev 	ptep = ptep;	/* shut lint up */
5195084Sjohnlev 	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
5205084Sjohnlev 		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
5215084Sjohnlev 		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
5225084Sjohnlev 		    (uint64_t)va, level, (uint64_t)ma, pteval);
5235084Sjohnlev #else
5243446Smrj 	if (va < 1024 * 1024)
5253446Smrj 		pteval |= PT_NOCACHE;		/* for video RAM */
5263446Smrj 	if (pae_support)
5273446Smrj 		*ptep = pteval;
5283446Smrj 	else
5293446Smrj 		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
5305084Sjohnlev #endif
5313446Smrj }

/*
 * Add a mapping for the physical page at the given virtual address.
 */
static void
map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
{
	map_ma_at_va(pa_to_ma(pa), va, level);
}

/*
 * This is called to remove start..end from the
 * possible range of PCI addresses.
 */
const uint64_t pci_lo_limit = 0x00100000ul;
const uint64_t pci_hi_limit = 0xfff00000ul;
static void
exclude_from_pci(uint64_t start, uint64_t end)
{
	int i;
	int j;
	struct boot_memlist *ml;

	for (i = 0; i < pcimemlists_used; ++i) {
		ml = &pcimemlists[i];

		/* delete the entire range? */
		if (start <= ml->addr && ml->addr + ml->size <= end) {
			--pcimemlists_used;
			for (j = i; j < pcimemlists_used; ++j)
				pcimemlists[j] = pcimemlists[j + 1];
			--i;	/* to revisit the new one at this index */
		}

		/* split a range? */
		else if (ml->addr < start && end < ml->addr + ml->size) {

			++pcimemlists_used;
			if (pcimemlists_used > MAX_MEMLIST)
				dboot_panic("too many pcimemlists");

			for (j = pcimemlists_used - 1; j > i; --j)
				pcimemlists[j] = pcimemlists[j - 1];
			ml->size = start - ml->addr;

			++ml;
			ml->size = (ml->addr + ml->size) - end;
			ml->addr = end;
			++i;	/* skip on to next one */
		}

		/* cut memory off the start? */
		else if (ml->addr < end && end < ml->addr + ml->size) {
			ml->size -= end - ml->addr;
			ml->addr = end;
		}

		/* cut memory off the end? */
		else if (ml->addr <= start && start < ml->addr + ml->size) {
			ml->size = start - ml->addr;
		}
	}
}
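
/*
 * Worked example (hypothetical values, for illustration only): starting
 * from the initial entry {0x100000, 0xffe00000} (pci_lo_limit to
 * pci_hi_limit), excluding RAM at 0x100000..0x40000000 hits the
 * "cut memory off the start" case and leaves {0x40000000, 0xbff00000};
 * a later exclusion of 0x80000000..0x90000000 hits the "split a range"
 * case and yields {0x40000000, 0x40000000} and {0x90000000, 0x6ff00000}.
 */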

/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif

static void
build_pcimemlists(mmap_t *mem, int num)
{
	mmap_t *mmap;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	uint64_t start;
	uint64_t end;
	int i;

	/*
	 * initialize
	 */
	pcimemlists[0].addr = pci_lo_limit;
	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
	pcimemlists_used = 1;

	/*
	 * Fill in PCI memlists.
	 */
	for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
		start = ((uint64_t)mmap->base_addr_high << 32) +
		    mmap->base_addr_low;
		end = start + ((uint64_t)mmap->length_high << 32) +
		    mmap->length_low;

		if (prom_debug)
			dboot_printf("\ttype: %d %" PRIx64 "..%"
			    PRIx64 "\n", mmap->type, start, end);

		/*
		 * page align start and end
		 */
		start = (start + page_offset) & ~page_offset;
		end &= ~page_offset;
		if (end <= start)
			continue;

		exclude_from_pci(start, end);
	}

	/*
	 * Finish off the pcimemlist
	 */
	if (prom_debug) {
		for (i = 0; i < pcimemlists_used; ++i) {
			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
			    PRIx64 "\n", pcimemlists[i].addr,
			    pcimemlists[i].addr + pcimemlists[i].size);
		}
	}
	pcimemlists[0].next = 0;
	pcimemlists[0].prev = 0;
	for (i = 1; i < pcimemlists_used; ++i) {
		pcimemlists[i].prev =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
		pcimemlists[i].next = 0;
		pcimemlists[i - 1].next =
		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
	}
	bi->bi_pcimem = (native_ptr_t)pcimemlists;
	DBG(bi->bi_pcimem);
}

#if defined(__xpv)
/*
 * Initialize memory allocator stuff from hypervisor-supplied start info.
 *
 * There is 512KB of scratch area after the boot stack page.
 * We'll use that for everything except the kernel nucleus pages which are too
 * big to fit there and are allocated last anyway.
 */
#define	MAXMAPS	100
static mmap_t map_buffer[MAXMAPS];
static void
init_mem_alloc(void)
{
	int	local;	/* variables needed to find start region */
	paddr_t	scratch_start;
	xen_memory_map_t map;

	DBG_MSG("Entered init_mem_alloc()\n");

	/*
	 * Free memory follows the stack. There's at least 512KB of scratch
	 * space, rounded up to at least 2Mb alignment.  That should be enough
	 * for the page tables we'll need to build.  The nucleus memory is
	 * allocated last and will be outside the addressable range.  We'll
	 * switch to new page tables before we unpack the kernel.
	 */
	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
	DBG(scratch_start);
	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
	DBG(scratch_end);

	/*
	 * For paranoia, leave some space between hypervisor data and ours.
	 * Use 500 instead of 512.
	 */
	next_avail_addr = scratch_end - 500 * 1024;
	DBG(next_avail_addr);

	/*
	 * The domain builder gives us at most 1 module
	 */
	DBG(xen_info->mod_len);
	if (xen_info->mod_len > 0) {
		DBG(xen_info->mod_start);
		modules[0].bm_addr = xen_info->mod_start;
		modules[0].bm_size = xen_info->mod_len;
		bi->bi_module_cnt = 1;
		bi->bi_modules = (native_ptr_t)modules;
	} else {
		bi->bi_module_cnt = 0;
		bi->bi_modules = NULL;
	}
	DBG(bi->bi_module_cnt);
	DBG(bi->bi_modules);

	DBG(xen_info->mfn_list);
	DBG(xen_info->nr_pages);
	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
	DBG(max_mem);

	/*
	 * Using pseudo-physical addresses, so only 1 memlist element
	 */
	memlists[0].addr = 0;
	DBG(memlists[0].addr);
	memlists[0].size = max_mem;
	DBG(memlists[0].size);
	memlists_used = 1;
	DBG(memlists_used);

	/*
	 * finish building physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved memlists
	 */
	build_rsvdmemlists();

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * build PCI Memory list
		 */
		map.nr_entries = MAXMAPS;
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(map.buffer, map_buffer);
		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
			dboot_panic("getting XENMEM_machine_memory_map failed");
		build_pcimemlists(map_buffer, map.nr_entries);
	}
}
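
/*
 * Worked example (hypothetical addresses, for illustration only): if the
 * local stack variable sits at 0x805c3c, scratch_start rounds up to
 * 0x806000, scratch_end rounds 0x886000 (scratch_start + 512KB) up to
 * the 2MB boundary at 0xa00000, and next_avail_addr starts at
 * 0xa00000 - 500KB == 0x983000.
 */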

#else	/* !__xpv */

/*
 * During memory allocation, find the highest address not used yet.
 */
static void
check_higher(paddr_t a)
{
	if (a < next_avail_addr)
		return;
	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
	DBG(next_avail_addr);
}

/*
 * Walk through the module information finding the last used address.
 * The first available address will become the top level page table.
 *
 * We then build the phys_install memlist from the multiboot information.
 */
static void
init_mem_alloc(void)
{
	mb_memory_map_t *mmap;
	mb_module_t *mod;
	uint64_t start;
	uint64_t end;
	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
	extern char _end[];
	int i;

	DBG_MSG("Entered init_mem_alloc()\n");
	DBG((uintptr_t)mb_info);

	if (mb_info->mods_count > MAX_MODULES) {
		dboot_panic("Too many modules (%d) -- the maximum is %d.",
		    mb_info->mods_count, MAX_MODULES);
	}
	/*
	 * search the modules to find the last used address
	 * we'll build the module list while we're walking through here
	 */
	DBG_MSG("\nFinding Modules\n");
	check_higher((paddr_t)&_end);
	for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
	    i < mb_info->mods_count;
	    ++mod, ++i) {
		if (prom_debug) {
			dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
			    i, (char *)(mod->mod_name),
			    (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
		}
		modules[i].bm_addr = mod->mod_start;
		if (mod->mod_start > mod->mod_end) {
			dboot_panic("module[%d]: Invalid module start address "
			    "(0x%llx)", i, (uint64_t)mod->mod_start);
		}
		modules[i].bm_size = mod->mod_end - mod->mod_start;

		check_higher(mod->mod_end);
	}
	bi->bi_modules = (native_ptr_t)modules;
	DBG(bi->bi_modules);
	bi->bi_module_cnt = mb_info->mods_count;
	DBG(bi->bi_module_cnt);

	/*
	 * Walk through the memory map from multiboot and build our memlist
	 * structures. Note these will have native format pointers.
	 */
	DBG_MSG("\nFinding Memory Map\n");
	DBG(mb_info->flags);
	max_mem = 0;
	if (mb_info->flags & 0x40) {
		int cnt = 0;

		DBG(mb_info->mmap_addr);
		DBG(mb_info->mmap_length);
		check_higher(mb_info->mmap_addr + mb_info->mmap_length);

		for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
		    (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
		    mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
		    + sizeof (mmap->size))) {
			++cnt;
			start = ((uint64_t)mmap->base_addr_high << 32) +
			    mmap->base_addr_low;
			end = start + ((uint64_t)mmap->length_high << 32) +
			    mmap->length_low;

			if (prom_debug)
				dboot_printf("\ttype: %d %" PRIx64 "..%"
				    PRIx64 "\n", mmap->type, start, end);

			/*
			 * page align start and end
			 */
			start = (start + page_offset) & ~page_offset;
			end &= ~page_offset;
			if (end <= start)
				continue;

			/*
			 * only type 1 is usable RAM
			 */
			switch (mmap->type) {
			case 1:
				if (end > max_mem)
					max_mem = end;
				memlists[memlists_used].addr = start;
				memlists[memlists_used].size = end - start;
				++memlists_used;
				if (memlists_used > MAX_MEMLIST)
					dboot_panic("too many memlists");
				break;
			case 2:
				rsvdmemlists[rsvdmemlists_used].addr = start;
				rsvdmemlists[rsvdmemlists_used].size =
				    end - start;
				++rsvdmemlists_used;
				if (rsvdmemlists_used > MAX_MEMLIST)
					dboot_panic("too many rsvdmemlists");
				break;
			default:
				continue;
			}
		}
		build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
	} else if (mb_info->flags & 0x01) {
		DBG(mb_info->mem_lower);
		memlists[memlists_used].addr = 0;
		memlists[memlists_used].size = mb_info->mem_lower * 1024;
		++memlists_used;
		DBG(mb_info->mem_upper);
		memlists[memlists_used].addr = 1024 * 1024;
		memlists[memlists_used].size = mb_info->mem_upper * 1024;
		++memlists_used;

		/*
		 * Old platform - assume I/O space at the end of memory.
		 */
		pcimemlists[0].addr =
		    (mb_info->mem_upper * 1024) + (1024 * 1024);
		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
		pcimemlists[0].next = 0;
		pcimemlists[0].prev = 0;
		bi->bi_pcimem = (native_ptr_t)pcimemlists;
		DBG(bi->bi_pcimem);
	} else {
		dboot_panic("No memory info from boot loader!!!");
	}

	check_higher(bi->bi_cmdline);

	/*
	 * finish processing the physinstall list
	 */
	sort_physinstall();

	/*
	 * build bios reserved mem lists
	 */
	build_rsvdmemlists();
}
#endif /* !__xpv */

/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * if not, is this address the best so far?
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}

void *
mem_alloc(uint32_t size)
{
	return (do_mem_alloc(size, MMU_PAGESIZE));
}
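
/*
 * Worked example (hypothetical values, for illustration only): on the
 * metal, with next_avail_addr at 0x123400 and a single memlist covering
 * 0..0x40000000, mem_alloc(0x1000) rounds next_avail_addr up to
 * 0x124000, zeroes and returns that page, and leaves next_avail_addr
 * at 0x125000.
 */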


/*
 * Build page tables to map all of memory used so far as well as the kernel.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* __xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables
	 */
	bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory.  (Specifically the video frame buffer and keyboard
	 * status ports.)  If we're booting on raw hardware then GRUB
	 * created these mappings for us.  If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;

		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}

#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"

/*
 * startup_kernel has a pretty simple job. It builds pagetables which reflect
 * 1:1 mappings for all memory in use. It then also adds mappings for
 * the kernel nucleus at the virtual address target_kernel_text using large
 * page mappings. The page table pages are also accessible at 1:1 mapped
 * virtual addresses.
 */
/*ARGSUSED*/
void
startup_kernel(void)
{
	char *cmdline;
	uintptr_t addr;
#if defined(__xpv)
	physdev_set_iopl_t set_iopl;
#endif /* __xpv */

	/*
	 * At this point we are executing in 32 bit protected mode.
	 */
#if defined(__xpv)
	cmdline = (char *)xen_info->cmd_line;
#else /* __xpv */
	cmdline = (char *)mb_info->cmdline;
#endif /* __xpv */

	prom_debug = (strstr(cmdline, "prom_debug") != NULL);
	map_debug = (strstr(cmdline, "map_debug") != NULL);

#if defined(__xpv)
	/*
	 * For dom0, before we initialize the console subsystem we'll
	 * need to enable I/O operations, so set the I/O privilege level to 1.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		set_iopl.iopl = 1;
		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	}
#endif /* __xpv */

	bcons_init(cmdline);
	DBG_MSG("\n\nSolaris prekernel set: ");
	DBG_MSG(cmdline);
	DBG_MSG("\n");

	if (strstr(cmdline, "multiboot") != NULL) {
		dboot_panic(NO_MULTIBOOT);
	}

	/*
	 * boot info must be 16 byte aligned for 64 bit kernel ABI
	 */
	addr = (uintptr_t)boot_info;
	addr = (addr + 0xf) & ~0xf;
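	/* e.g. (hypothetical) an addr of 0x8045c4 rounds up to 0x8045d0 */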
	bi = (struct xboot_info *)addr;
	DBG((uintptr_t)bi);
	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;

	/*
	 * Need correct target_kernel_text value
	 */
#if defined(_BOOT_TARGET_amd64)
	target_kernel_text = KERNEL_TEXT_amd64;
#elif defined(__xpv)
	target_kernel_text = KERNEL_TEXT_i386_xpv;
#else
	target_kernel_text = KERNEL_TEXT_i386;
#endif
	DBG(target_kernel_text);

#if defined(__xpv)

	/*
	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
	 */

#if defined(_BOOT_TARGET_amd64)
	/*
	 * 64-bit hypervisor.
	 */
	amd64_support = 1;
	pae_support = 1;

#else	/* _BOOT_TARGET_amd64 */

	/*
	 * See if we are running on a PAE Hypervisor
	 */
	{
		xen_capabilities_info_t caps;

		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
			dboot_panic("HYPERVISOR_xen_version(caps) failed");
		caps[sizeof (caps) - 1] = 0;
		if (prom_debug)
			dboot_printf("xen capabilities %s\n", caps);
		if (strstr(caps, "x86_32p") != NULL)
			pae_support = 1;
	}

#endif	/* _BOOT_TARGET_amd64 */
	{
		xen_platform_parameters_t p;

		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
			dboot_panic("HYPERVISOR_xen_version(parms) failed");
		DBG(p.virt_start);
		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
	}

	/*
	 * The hypervisor loads stuff starting at 1Gig
	 */
	mfn_base = ONE_GIG;
	DBG(mfn_base);

	/*
	 * enable writable page table mode for the hypervisor
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_writable_pagetables) < 0)
		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");

	/*
	 * check for NX support
	 */
	if (pae_support) {
		uint32_t eax = 0x80000000;
		uint32_t edx = get_cpuid_edx(&eax);

		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	}

#if !defined(_BOOT_TARGET_amd64)

	/*
	 * The 32-bit hypervisor uses segmentation to protect itself from
	 * guests. This means when a guest attempts to install a flat 4GB
	 * code or data descriptor the 32-bit hypervisor will protect itself
	 * by silently shrinking the segment such that if the guest attempts
	 * any access where the hypervisor lives a #gp fault is generated.
	 * The problem is that some applications expect a full 4GB flat
	 * segment for their current thread pointer and will use negative
	 * offset segment wrap around to access data. TLS support in linux
	 * brand is one example of this.
	 *
	 * The 32-bit hypervisor can catch the #gp fault in these cases
	 * and emulate the access without passing the #gp fault to the guest
	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
	 * Seems like this should have been the default.
	 * Either way, we want the hypervisor -- and not Solaris -- to deal
	 * with emulating these accesses.
	 */
	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
	    VMASST_TYPE_4gb_segments) < 0)
		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
#endif	/* !_BOOT_TARGET_amd64 */

#else	/* __xpv */

	/*
	 * use cpuid to enable MMU features
	 */
	if (have_cpuid()) {
		uint32_t eax, edx;

		eax = 1;
		edx = get_cpuid_edx(&eax);
		if (edx & CPUID_INTC_EDX_PSE)
			largepage_support = 1;
		if (edx & CPUID_INTC_EDX_PGE)
			pge_support = 1;
		if (edx & CPUID_INTC_EDX_PAE)
			pae_support = 1;

		eax = 0x80000000;
		edx = get_cpuid_edx(&eax);
		if (eax >= 0x80000001) {
			eax = 0x80000001;
			edx = get_cpuid_edx(&eax);
			if (edx & CPUID_AMD_EDX_LM)
				amd64_support = 1;
			if (edx & CPUID_AMD_EDX_NX)
				NX_support = 1;
		}
	} else {
		dboot_printf("cpuid not supported\n");
	}
#endif /* __xpv */


#if defined(_BOOT_TARGET_amd64)
	if (amd64_support == 0)
		dboot_panic("long mode not supported, rebooting");
	else if (pae_support == 0)
		dboot_panic("long mode, but no PAE; rebooting");
#else
	/*
	 * Allow the command line to override use of PAE for 32 bit.
	 */
	if (strstr(cmdline, "disablePAE=true") != NULL) {
		pae_support = 0;
		NX_support = 0;
		amd64_support = 0;
	}
#endif

	/*
	 * initialize the simple memory allocator
	 */
	init_mem_alloc();

#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
	/*
	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
	 */
	if (max_mem < FOUR_GIG && NX_support == 0)
		pae_support = 0;
#endif

	/*
	 * configure mmu information
	 */
	if (pae_support) {
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#if defined(_BOOT_TARGET_amd64)
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		pae_support = 0;
		NX_support = 0;
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}
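
	/*
	 * In either case ptes_per_table * pte_size == MMU_PAGESIZE
	 * (512 * 8 == 1024 * 4 == 4096), so each page table at every
	 * level occupies exactly one page.
	 */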

	DBG(pge_support);
	DBG(NX_support);
	DBG(largepage_support);
	DBG(amd64_support);
	DBG(top_level);
	DBG(pte_size);
	DBG(ptes_per_table);
	DBG(lpagesize);

#if defined(__xpv)
	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
#else
	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
#endif

#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
	/*
	 * For grub, copy kernel bits from the ELF64 file to final place.
	 */
	DBG_MSG("\nAllocating nucleus pages.\n");
	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
	if (ktext_phys == 0)
		dboot_panic("failed to allocate aligned kernel memory");
	if (dboot_elfload64(mb_header.load_addr) != 0)
		dboot_panic("failed to parse kernel ELF image, rebooting");
#endif

	DBG(ktext_phys);

	/*
	 * Allocate page tables.
	 */
	build_page_tables();

	/*
	 * return to assembly code to switch to running kernel
	 */
	entry_addr_low = (uint32_t)target_kernel_text;
	DBG(entry_addr_low);
	bi->bi_use_largepage = largepage_support;
	bi->bi_use_pae = pae_support;
	bi->bi_use_pge = pge_support;
	bi->bi_use_nx = NX_support;

#if defined(__xpv)

	bi->bi_next_paddr = next_avail_addr - mfn_base;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);

	/*
	 * unmap unused pages in start area to make them available for DMA
	 */
	while (next_avail_addr < scratch_end) {
		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
		    0, UVMF_INVLPG | UVMF_LOCAL);
		next_avail_addr += MMU_PAGESIZE;
	}

	bi->bi_xen_start_info = (uintptr_t)xen_info;
	DBG((uintptr_t)HYPERVISOR_shared_info);
	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;

#else /* __xpv */

	bi->bi_next_paddr = next_avail_addr;
	DBG(bi->bi_next_paddr);
	bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
	DBG(bi->bi_next_vaddr);
	bi->bi_mb_info = (uintptr_t)mb_info;
	bi->bi_top_page_table = (uintptr_t)top_page_table;

#endif /* __xpv */

	bi->bi_kseg_size = FOUR_MEG;
	DBG(bi->bi_kseg_size);

#ifndef __xpv
	if (map_debug)
		dump_tables();
#endif

	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
14523446Smrj }
1453