1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27
28 #include <sys/types.h>
29 #include <sys/machparam.h>
30 #include <sys/x86_archext.h>
31 #include <sys/systm.h>
32 #include <sys/mach_mmu.h>
33 #include <sys/multiboot.h>
34
35 #if defined(__xpv)
36
37 #include <sys/hypervisor.h>
38 uintptr_t xen_virt_start;
39 pfn_t *mfn_to_pfn_mapping;
40
41 #else /* !__xpv */
42
43 extern multiboot_header_t mb_header;
44 extern int have_cpuid(void);
45
46 #endif /* !__xpv */
47
48 #include <sys/inttypes.h>
49 #include <sys/bootinfo.h>
50 #include <sys/mach_mmu.h>
51 #include <sys/boot_console.h>
52
53 #include "dboot_asm.h"
54 #include "dboot_printf.h"
55 #include "dboot_xboot.h"
56 #include "dboot_elfload.h"
57
58 /*
59 * This file contains code that runs to transition us from either a multiboot
60 * compliant loader (32 bit non-paging) or a XPV domain loader to
61 * regular kernel execution. Its task is to setup the kernel memory image
62 * and page tables.
63 *
64 * The code executes as:
65 * - 32 bits under GRUB (for 32 or 64 bit Solaris)
66 * - a 32 bit program for the 32-bit PV hypervisor
67 * - a 64 bit program for the 64-bit PV hypervisor (at least for now)
68 *
69 * Under the PV hypervisor, we must create mappings for any memory beyond the
70 * initial start of day allocation (such as the kernel itself).
71 *
72 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
74 */
75
/*
 * Standard bits used in PTE (page level) and PTP (internal levels).
 * ptp_bits is OR-ed into entries that point at a lower level page table
 * (see make_ptable()); pte_bits is OR-ed into the final mapping entries
 * (see map_ma_at_va()).
 */
x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;

/*
 * This is the target addresses (physical) where the kernel text and data
 * nucleus pages will be unpacked. On the hypervisor this is actually a
 * virtual address.
 */
paddr_t ktext_phys;
uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */

static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */

/*
 * The stack is setup in assembler before entering startup_kernel()
 */
char stack_space[STACK_SIZE];

/*
 * Used to track physical memory allocation. do_mem_alloc() hands out memory
 * starting here and only ever moves this value upwards.
 */
static paddr_t next_avail_addr = 0;
101
#if defined(__xpv)
/*
 * Additional information needed for hypervisor memory allocation.
 * Only memory up to scratch_end is mapped by page tables.
 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
 * to derive a pfn from a pointer, you subtract mfn_base.
 */

static paddr_t scratch_end = 0;	/* we can't write all of mem here */
static paddr_t mfn_base;	/* addr corresponding to mfn_list[0] */
start_info_t *xen_info;		/* hypervisor-supplied start info */

#else	/* __xpv */

/*
 * If on the metal, then we have a multiboot loader.
 */
multiboot_info_t *mb_info;

#endif	/* __xpv */
122
/*
 * This contains information passed to the kernel
 */
struct xboot_info boot_info[2];	/* extra space to fix alignment for amd64 */
struct xboot_info *bi;		/* aligned pointer into boot_info[] */

/*
 * Page table and memory stuff.
 */
static paddr_t max_mem;			/* maximum memory address */

/*
 * Information about processor MMU
 */
int amd64_support = 0;
int largepage_support = 0;
int pae_support = 0;
int pge_support = 0;
int NX_support = 0;

/*
 * Low 32 bits of kernel entry address passed back to assembler.
 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
 */
uint32_t entry_addr_low;

/*
 * Memlists for the kernel. We shouldn't need a lot of these.
 * Each array has a matching *_used counter giving the number of entries
 * currently filled in.
 */
#define	MAX_MEMLIST (50)
struct boot_memlist memlists[MAX_MEMLIST];
uint_t memlists_used = 0;
struct boot_memlist pcimemlists[MAX_MEMLIST];
uint_t pcimemlists_used = 0;
struct boot_memlist rsvdmemlists[MAX_MEMLIST];
uint_t rsvdmemlists_used = 0;

#define	MAX_MODULES (10)
struct boot_modules modules[MAX_MODULES];
uint_t modules_used = 0;

/*
 * Debugging flags. startup_kernel() turns these on when the matching
 * word ("prom_debug" / "map_debug") appears on the boot command line.
 */
uint_t prom_debug = 0;
uint_t map_debug = 0;
169
170 /*
171 * Either hypervisor-specific or grub-specific code builds the initial
172 * memlists. This code does the sort/merge/link for final use.
173 */
174 static void
sort_physinstall(void)175 sort_physinstall(void)
176 {
177 int i;
178 #if !defined(__xpv)
179 int j;
180 struct boot_memlist tmp;
181
182 /*
183 * Now sort the memlists, in case they weren't in order.
184 * Yeah, this is a bubble sort; small, simple and easy to get right.
185 */
186 DBG_MSG("Sorting phys-installed list\n");
187 for (j = memlists_used - 1; j > 0; --j) {
188 for (i = 0; i < j; ++i) {
189 if (memlists[i].addr < memlists[i + 1].addr)
190 continue;
191 tmp = memlists[i];
192 memlists[i] = memlists[i + 1];
193 memlists[i + 1] = tmp;
194 }
195 }
196
197 /*
198 * Merge any memlists that don't have holes between them.
199 */
200 for (i = 0; i <= memlists_used - 1; ++i) {
201 if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
202 continue;
203
204 if (prom_debug)
205 dboot_printf(
206 "merging mem segs %" PRIx64 "...%" PRIx64
207 " w/ %" PRIx64 "...%" PRIx64 "\n",
208 memlists[i].addr,
209 memlists[i].addr + memlists[i].size,
210 memlists[i + 1].addr,
211 memlists[i + 1].addr + memlists[i + 1].size);
212
213 memlists[i].size += memlists[i + 1].size;
214 for (j = i + 1; j < memlists_used - 1; ++j)
215 memlists[j] = memlists[j + 1];
216 --memlists_used;
217 DBG(memlists_used);
218 --i; /* after merging we need to reexamine, so do this */
219 }
220 #endif /* __xpv */
221
222 if (prom_debug) {
223 dboot_printf("\nFinal memlists:\n");
224 for (i = 0; i < memlists_used; ++i) {
225 dboot_printf("\t%d: addr=%" PRIx64 " size=%"
226 PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
227 }
228 }
229
230 /*
231 * link together the memlists with native size pointers
232 */
233 memlists[0].next = 0;
234 memlists[0].prev = 0;
235 for (i = 1; i < memlists_used; ++i) {
236 memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
237 memlists[i].next = 0;
238 memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
239 }
240 bi->bi_phys_install = (native_ptr_t)memlists;
241 DBG(bi->bi_phys_install);
242 }
243
244 /*
245 * build bios reserved memlists
246 */
247 static void
build_rsvdmemlists(void)248 build_rsvdmemlists(void)
249 {
250 int i;
251
252 rsvdmemlists[0].next = 0;
253 rsvdmemlists[0].prev = 0;
254 for (i = 1; i < rsvdmemlists_used; ++i) {
255 rsvdmemlists[i].prev =
256 (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
257 rsvdmemlists[i].next = 0;
258 rsvdmemlists[i - 1].next =
259 (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
260 }
261 bi->bi_rsvdmem = (native_ptr_t)rsvdmemlists;
262 DBG(bi->bi_rsvdmem);
263 }
264
265 #if defined(__xpv)
266
267 /*
268 * halt on the hypervisor after a delay to drain console output
269 */
270 void
dboot_halt(void)271 dboot_halt(void)
272 {
273 uint_t i = 10000;
274
275 while (--i)
276 (void) HYPERVISOR_yield();
277 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
278 }
279
280 /*
281 * From a machine address, find the corresponding pseudo-physical address.
282 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
283 * Machine addresses are the real underlying hardware addresses.
284 * These are needed for page table entries. Note that this routine is
285 * poorly protected. A bad value of "ma" will cause a page fault.
286 */
287 paddr_t
ma_to_pa(maddr_t ma)288 ma_to_pa(maddr_t ma)
289 {
290 ulong_t pgoff = ma & MMU_PAGEOFFSET;
291 ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
292 paddr_t pa;
293
294 if (pfn >= xen_info->nr_pages)
295 return (-(paddr_t)1);
296 pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
297 #ifdef DEBUG
298 if (ma != pa_to_ma(pa))
299 dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
300 "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
301 #endif
302 return (pa);
303 }
304
305 /*
306 * From a pseudo-physical address, find the corresponding machine address.
307 */
308 maddr_t
pa_to_ma(paddr_t pa)309 pa_to_ma(paddr_t pa)
310 {
311 pfn_t pfn;
312 ulong_t mfn;
313
314 pfn = mmu_btop(pa - mfn_base);
315 if (pa < mfn_base || pfn >= xen_info->nr_pages)
316 dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
317 mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
318 #ifdef DEBUG
319 if (mfn_to_pfn_mapping[mfn] != pfn)
320 dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
321 pfn, mfn, mfn_to_pfn_mapping[mfn]);
322 #endif
323 return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
324 }
325
326 #endif /* __xpv */
327
328 x86pte_t
get_pteval(paddr_t table,uint_t index)329 get_pteval(paddr_t table, uint_t index)
330 {
331 if (pae_support)
332 return (((x86pte_t *)(uintptr_t)table)[index]);
333 return (((x86pte32_t *)(uintptr_t)table)[index]);
334 }
335
336 /*ARGSUSED*/
337 void
set_pteval(paddr_t table,uint_t index,uint_t level,x86pte_t pteval)338 set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
339 {
340 #ifdef __xpv
341 mmu_update_t t;
342 maddr_t mtable = pa_to_ma(table);
343 int retcnt;
344
345 t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
346 t.val = pteval;
347 if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
348 dboot_panic("HYPERVISOR_mmu_update() failed");
349 #else /* __xpv */
350 uintptr_t tab_addr = (uintptr_t)table;
351
352 if (pae_support)
353 ((x86pte_t *)tab_addr)[index] = pteval;
354 else
355 ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
356 if (level == top_level && level == 2)
357 reload_cr3();
358 #endif /* __xpv */
359 }
360
361 paddr_t
make_ptable(x86pte_t * pteval,uint_t level)362 make_ptable(x86pte_t *pteval, uint_t level)
363 {
364 paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
365
366 if (level == top_level && level == 2)
367 *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
368 else
369 *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
370
371 #ifdef __xpv
372 /* Remove write permission to the new page table. */
373 if (HYPERVISOR_update_va_mapping(new_table,
374 *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
375 dboot_panic("HYP_update_va_mapping error");
376 #endif
377
378 if (map_debug)
379 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
380 PRIx64 "\n", level, (ulong_t)new_table, *pteval);
381 return (new_table);
382 }
383
384 x86pte_t *
map_pte(paddr_t table,uint_t index)385 map_pte(paddr_t table, uint_t index)
386 {
387 return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
388 }
389
390 /*
391 * dump out the contents of page tables...
392 */
393 static void
dump_tables(void)394 dump_tables(void)
395 {
396 uint_t save_index[4]; /* for recursion */
397 char *save_table[4]; /* for recursion */
398 uint_t l;
399 uint64_t va;
400 uint64_t pgsize;
401 int index;
402 int i;
403 x86pte_t pteval;
404 char *table;
405 static char *tablist = "\t\t\t";
406 char *tabs = tablist + 3 - top_level;
407 uint_t pa, pa1;
408 #if !defined(__xpv)
409 #define maddr_t paddr_t
410 #endif /* !__xpv */
411
412 dboot_printf("Finished pagetables:\n");
413 table = (char *)(uintptr_t)top_page_table;
414 l = top_level;
415 va = 0;
416 for (index = 0; index < ptes_per_table; ++index) {
417 pgsize = 1ull << shift_amt[l];
418 if (pae_support)
419 pteval = ((x86pte_t *)table)[index];
420 else
421 pteval = ((x86pte32_t *)table)[index];
422 if (pteval == 0)
423 goto next_entry;
424
425 dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
426 tabs + l, (void *)table, index, (uint64_t)pteval, va);
427 pa = ma_to_pa(pteval & MMU_PAGEMASK);
428 dboot_printf(" physaddr=%x\n", pa);
429
430 /*
431 * Don't try to walk hypervisor private pagetables
432 */
433 if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
434 save_table[l] = table;
435 save_index[l] = index;
436 --l;
437 index = -1;
438 table = (char *)(uintptr_t)
439 ma_to_pa(pteval & MMU_PAGEMASK);
440 goto recursion;
441 }
442
443 /*
444 * shorten dump for consecutive mappings
445 */
446 for (i = 1; index + i < ptes_per_table; ++i) {
447 if (pae_support)
448 pteval = ((x86pte_t *)table)[index + i];
449 else
450 pteval = ((x86pte32_t *)table)[index + i];
451 if (pteval == 0)
452 break;
453 pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
454 if (pa1 != pa + i * pgsize)
455 break;
456 }
457 if (i > 2) {
458 dboot_printf("%s...\n", tabs + l);
459 va += pgsize * (i - 2);
460 index += i - 2;
461 }
462 next_entry:
463 va += pgsize;
464 if (l == 3 && index == 256) /* VA hole */
465 va = 0xffff800000000000ull;
466 recursion:
467 ;
468 }
469 if (l < top_level) {
470 ++l;
471 index = save_index[l];
472 table = save_table[l];
473 goto recursion;
474 }
475 }
476
477 /*
478 * Add a mapping for the machine page at the given virtual address.
479 */
480 static void
map_ma_at_va(maddr_t ma,native_ptr_t va,uint_t level)481 map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
482 {
483 x86pte_t *ptep;
484 x86pte_t pteval;
485
486 pteval = ma | pte_bits;
487 if (level > 0)
488 pteval |= PT_PAGESIZE;
489 if (va >= target_kernel_text && pge_support)
490 pteval |= PT_GLOBAL;
491
492 if (map_debug && ma != va)
493 dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
494 " pte=0x%" PRIx64 " l=%d\n",
495 (uint64_t)ma, (uint64_t)va, pteval, level);
496
497 #if defined(__xpv)
498 /*
499 * see if we can avoid find_pte() on the hypervisor
500 */
501 if (HYPERVISOR_update_va_mapping(va, pteval,
502 UVMF_INVLPG | UVMF_LOCAL) == 0)
503 return;
504 #endif
505
506 /*
507 * Find the pte that will map this address. This creates any
508 * missing intermediate level page tables
509 */
510 ptep = find_pte(va, NULL, level, 0);
511
512 /*
513 * When paravirtualized, we must use hypervisor calls to modify the
514 * PTE, since paging is active. On real hardware we just write to
515 * the pagetables which aren't in use yet.
516 */
517 #if defined(__xpv)
518 ptep = ptep; /* shut lint up */
519 if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
520 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
521 " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
522 (uint64_t)va, level, (uint64_t)ma, pteval);
523 #else
524 if (va < 1024 * 1024)
525 pteval |= PT_NOCACHE; /* for video RAM */
526 if (pae_support)
527 *ptep = pteval;
528 else
529 *((x86pte32_t *)ptep) = (x86pte32_t)pteval;
530 #endif
531 }
532
533 /*
534 * Add a mapping for the physical page at the given virtual address.
535 */
536 static void
map_pa_at_va(paddr_t pa,native_ptr_t va,uint_t level)537 map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
538 {
539 map_ma_at_va(pa_to_ma(pa), va, level);
540 }
541
542 /*
543 * This is called to remove start..end from the
544 * possible range of PCI addresses.
545 */
546 const uint64_t pci_lo_limit = 0x00100000ul;
547 const uint64_t pci_hi_limit = 0xfff00000ul;
548 static void
exclude_from_pci(uint64_t start,uint64_t end)549 exclude_from_pci(uint64_t start, uint64_t end)
550 {
551 int i;
552 int j;
553 struct boot_memlist *ml;
554
555 for (i = 0; i < pcimemlists_used; ++i) {
556 ml = &pcimemlists[i];
557
558 /* delete the entire range? */
559 if (start <= ml->addr && ml->addr + ml->size <= end) {
560 --pcimemlists_used;
561 for (j = i; j < pcimemlists_used; ++j)
562 pcimemlists[j] = pcimemlists[j + 1];
563 --i; /* to revisit the new one at this index */
564 }
565
566 /* split a range? */
567 else if (ml->addr < start && end < ml->addr + ml->size) {
568
569 ++pcimemlists_used;
570 if (pcimemlists_used > MAX_MEMLIST)
571 dboot_panic("too many pcimemlists");
572
573 for (j = pcimemlists_used - 1; j > i; --j)
574 pcimemlists[j] = pcimemlists[j - 1];
575 ml->size = start - ml->addr;
576
577 ++ml;
578 ml->size = (ml->addr + ml->size) - end;
579 ml->addr = end;
580 ++i; /* skip on to next one */
581 }
582
583 /* cut memory off the start? */
584 else if (ml->addr < end && end < ml->addr + ml->size) {
585 ml->size -= end - ml->addr;
586 ml->addr = end;
587 }
588
589 /* cut memory off the end? */
590 else if (ml->addr <= start && start < ml->addr + ml->size) {
591 ml->size = start - ml->addr;
592 }
593 }
594 }
595
/*
 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
 * definition in Xen source.
 *
 * mmap_t is therefore a local structure without the size field when built
 * for the hypervisor, and simply the multiboot mb_memory_map_t otherwise.
 * Addresses and lengths are stored as separate low/high 32 bit halves.
 */
#ifdef __xpv
typedef struct {
	uint32_t	base_addr_low;
	uint32_t	base_addr_high;
	uint32_t	length_low;
	uint32_t	length_high;
	uint32_t	type;
} mmap_t;
#else
typedef mb_memory_map_t mmap_t;
#endif
611
612 static void
build_pcimemlists(mmap_t * mem,int num)613 build_pcimemlists(mmap_t *mem, int num)
614 {
615 mmap_t *mmap;
616 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
617 uint64_t start;
618 uint64_t end;
619 int i;
620
621 /*
622 * initialize
623 */
624 pcimemlists[0].addr = pci_lo_limit;
625 pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
626 pcimemlists_used = 1;
627
628 /*
629 * Fill in PCI memlists.
630 */
631 for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
632 start = ((uint64_t)mmap->base_addr_high << 32) +
633 mmap->base_addr_low;
634 end = start + ((uint64_t)mmap->length_high << 32) +
635 mmap->length_low;
636
637 if (prom_debug)
638 dboot_printf("\ttype: %d %" PRIx64 "..%"
639 PRIx64 "\n", mmap->type, start, end);
640
641 /*
642 * page align start and end
643 */
644 start = (start + page_offset) & ~page_offset;
645 end &= ~page_offset;
646 if (end <= start)
647 continue;
648
649 exclude_from_pci(start, end);
650 }
651
652 /*
653 * Finish off the pcimemlist
654 */
655 if (prom_debug) {
656 for (i = 0; i < pcimemlists_used; ++i) {
657 dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
658 PRIx64 "\n", pcimemlists[i].addr,
659 pcimemlists[i].addr + pcimemlists[i].size);
660 }
661 }
662 pcimemlists[0].next = 0;
663 pcimemlists[0].prev = 0;
664 for (i = 1; i < pcimemlists_used; ++i) {
665 pcimemlists[i].prev =
666 (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
667 pcimemlists[i].next = 0;
668 pcimemlists[i - 1].next =
669 (native_ptr_t)(uintptr_t)(pcimemlists + i);
670 }
671 bi->bi_pcimem = (native_ptr_t)pcimemlists;
672 DBG(bi->bi_pcimem);
673 }
674
675 #if defined(__xpv)
676 /*
677 * Initialize memory allocator stuff from hypervisor-supplied start info.
678 *
679 * There is 512KB of scratch area after the boot stack page.
680 * We'll use that for everything except the kernel nucleus pages which are too
681 * big to fit there and are allocated last anyway.
682 */
683 #define MAXMAPS 100
684 static mmap_t map_buffer[MAXMAPS];
685 static void
init_mem_alloc(void)686 init_mem_alloc(void)
687 {
688 int local; /* variables needed to find start region */
689 paddr_t scratch_start;
690 xen_memory_map_t map;
691
692 DBG_MSG("Entered init_mem_alloc()\n");
693
694 /*
695 * Free memory follows the stack. There's at least 512KB of scratch
696 * space, rounded up to at least 2Mb alignment. That should be enough
697 * for the page tables we'll need to build. The nucleus memory is
698 * allocated last and will be outside the addressible range. We'll
699 * switch to new page tables before we unpack the kernel
700 */
701 scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
702 DBG(scratch_start);
703 scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
704 DBG(scratch_end);
705
706 /*
707 * For paranoia, leave some space between hypervisor data and ours.
708 * Use 500 instead of 512.
709 */
710 next_avail_addr = scratch_end - 500 * 1024;
711 DBG(next_avail_addr);
712
713 /*
714 * The domain builder gives us at most 1 module
715 */
716 DBG(xen_info->mod_len);
717 if (xen_info->mod_len > 0) {
718 DBG(xen_info->mod_start);
719 modules[0].bm_addr = xen_info->mod_start;
720 modules[0].bm_size = xen_info->mod_len;
721 bi->bi_module_cnt = 1;
722 bi->bi_modules = (native_ptr_t)modules;
723 } else {
724 bi->bi_module_cnt = 0;
725 bi->bi_modules = NULL;
726 }
727 DBG(bi->bi_module_cnt);
728 DBG(bi->bi_modules);
729
730 DBG(xen_info->mfn_list);
731 DBG(xen_info->nr_pages);
732 max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
733 DBG(max_mem);
734
735 /*
736 * Using pseudo-physical addresses, so only 1 memlist element
737 */
738 memlists[0].addr = 0;
739 DBG(memlists[0].addr);
740 memlists[0].size = max_mem;
741 DBG(memlists[0].size);
742 memlists_used = 1;
743 DBG(memlists_used);
744
745 /*
746 * finish building physinstall list
747 */
748 sort_physinstall();
749
750 /*
751 * build bios reserved memlists
752 */
753 build_rsvdmemlists();
754
755 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
756 /*
757 * build PCI Memory list
758 */
759 map.nr_entries = MAXMAPS;
760 /*LINTED: constant in conditional context*/
761 set_xen_guest_handle(map.buffer, map_buffer);
762 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
763 dboot_panic("getting XENMEM_machine_memory_map failed");
764 build_pcimemlists(map_buffer, map.nr_entries);
765 }
766 }
767
768 #else /* !__xpv */
769
770 /*
771 * During memory allocation, find the highest address not used yet.
772 */
773 static void
check_higher(paddr_t a)774 check_higher(paddr_t a)
775 {
776 if (a < next_avail_addr)
777 return;
778 next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
779 DBG(next_avail_addr);
780 }
781
782 /*
783 * Walk through the module information finding the last used address.
784 * The first available address will become the top level page table.
785 *
786 * We then build the phys_install memlist from the multiboot information.
787 */
788 static void
init_mem_alloc(void)789 init_mem_alloc(void)
790 {
791 mb_memory_map_t *mmap;
792 mb_module_t *mod;
793 uint64_t start;
794 uint64_t end;
795 uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
796 extern char _end[];
797 int i;
798
799 DBG_MSG("Entered init_mem_alloc()\n");
800 DBG((uintptr_t)mb_info);
801
802 if (mb_info->mods_count > MAX_MODULES) {
803 dboot_panic("Too many modules (%d) -- the maximum is %d.",
804 mb_info->mods_count, MAX_MODULES);
805 }
806 /*
807 * search the modules to find the last used address
808 * we'll build the module list while we're walking through here
809 */
810 DBG_MSG("\nFinding Modules\n");
811 check_higher((paddr_t)&_end);
812 for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
813 i < mb_info->mods_count;
814 ++mod, ++i) {
815 if (prom_debug) {
816 dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n",
817 i, (char *)(mod->mod_name),
818 (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
819 }
820 modules[i].bm_addr = mod->mod_start;
821 if (mod->mod_start > mod->mod_end) {
822 dboot_panic("module[%d]: Invalid module start address "
823 "(0x%llx)", i, (uint64_t)mod->mod_start);
824 }
825 modules[i].bm_size = mod->mod_end - mod->mod_start;
826
827 check_higher(mod->mod_end);
828 }
829 bi->bi_modules = (native_ptr_t)modules;
830 DBG(bi->bi_modules);
831 bi->bi_module_cnt = mb_info->mods_count;
832 DBG(bi->bi_module_cnt);
833
834 /*
835 * Walk through the memory map from multiboot and build our memlist
836 * structures. Note these will have native format pointers.
837 */
838 DBG_MSG("\nFinding Memory Map\n");
839 DBG(mb_info->flags);
840 max_mem = 0;
841 if (mb_info->flags & 0x40) {
842 int cnt = 0;
843
844 DBG(mb_info->mmap_addr);
845 DBG(mb_info->mmap_length);
846 check_higher(mb_info->mmap_addr + mb_info->mmap_length);
847
848 for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
849 (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
850 mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
851 + sizeof (mmap->size))) {
852 ++cnt;
853 start = ((uint64_t)mmap->base_addr_high << 32) +
854 mmap->base_addr_low;
855 end = start + ((uint64_t)mmap->length_high << 32) +
856 mmap->length_low;
857
858 if (prom_debug)
859 dboot_printf("\ttype: %d %" PRIx64 "..%"
860 PRIx64 "\n", mmap->type, start, end);
861
862 /*
863 * page align start and end
864 */
865 start = (start + page_offset) & ~page_offset;
866 end &= ~page_offset;
867 if (end <= start)
868 continue;
869
870 /*
871 * only type 1 is usable RAM
872 */
873 switch (mmap->type) {
874 case 1:
875 if (end > max_mem)
876 max_mem = end;
877 memlists[memlists_used].addr = start;
878 memlists[memlists_used].size = end - start;
879 ++memlists_used;
880 if (memlists_used > MAX_MEMLIST)
881 dboot_panic("too many memlists");
882 break;
883 case 2:
884 rsvdmemlists[rsvdmemlists_used].addr = start;
885 rsvdmemlists[rsvdmemlists_used].size =
886 end - start;
887 ++rsvdmemlists_used;
888 if (rsvdmemlists_used > MAX_MEMLIST)
889 dboot_panic("too many rsvdmemlists");
890 break;
891 default:
892 continue;
893 }
894 }
895 build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
896 } else if (mb_info->flags & 0x01) {
897 DBG(mb_info->mem_lower);
898 memlists[memlists_used].addr = 0;
899 memlists[memlists_used].size = mb_info->mem_lower * 1024;
900 ++memlists_used;
901 DBG(mb_info->mem_upper);
902 memlists[memlists_used].addr = 1024 * 1024;
903 memlists[memlists_used].size = mb_info->mem_upper * 1024;
904 ++memlists_used;
905
906 /*
907 * Old platform - assume I/O space at the end of memory.
908 */
909 pcimemlists[0].addr =
910 (mb_info->mem_upper * 1024) + (1024 * 1024);
911 pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
912 pcimemlists[0].next = 0;
913 pcimemlists[0].prev = 0;
914 bi->bi_pcimem = (native_ptr_t)pcimemlists;
915 DBG(bi->bi_pcimem);
916 } else {
917 dboot_panic("No memory info from boot loader!!!");
918 }
919
920 check_higher(bi->bi_cmdline);
921
922 /*
923 * finish processing the physinstall list
924 */
925 sort_physinstall();
926
927 /*
928 * build bios reserved mem lists
929 */
930 build_rsvdmemlists();
931 }
932 #endif /* !__xpv */
933
934 /*
935 * Simple memory allocator, allocates aligned physical memory.
936 * Note that startup_kernel() only allocates memory, never frees.
937 * Memory usage just grows in an upward direction.
938 */
939 static void *
do_mem_alloc(uint32_t size,uint32_t align)940 do_mem_alloc(uint32_t size, uint32_t align)
941 {
942 uint_t i;
943 uint64_t best;
944 uint64_t start;
945 uint64_t end;
946
947 /*
948 * make sure size is a multiple of pagesize
949 */
950 size = RNDUP(size, MMU_PAGESIZE);
951 next_avail_addr = RNDUP(next_avail_addr, align);
952
953 /*
954 * XXPV fixme joe
955 *
956 * a really large bootarchive that causes you to run out of memory
957 * may cause this to blow up
958 */
959 /* LINTED E_UNEXPECTED_UINT_PROMOTION */
960 best = (uint64_t)-size;
961 for (i = 0; i < memlists_used; ++i) {
962 start = memlists[i].addr;
963 #if defined(__xpv)
964 start += mfn_base;
965 #endif
966 end = start + memlists[i].size;
967
968 /*
969 * did we find the desired address?
970 */
971 if (start <= next_avail_addr && next_avail_addr + size <= end) {
972 best = next_avail_addr;
973 goto done;
974 }
975
976 /*
977 * if not is this address the best so far?
978 */
979 if (start > next_avail_addr && start < best &&
980 RNDUP(start, align) + size <= end)
981 best = RNDUP(start, align);
982 }
983
984 /*
985 * We didn't find exactly the address we wanted, due to going off the
986 * end of a memory region. Return the best found memory address.
987 */
988 done:
989 next_avail_addr = best + size;
990 #if defined(__xpv)
991 if (next_avail_addr > scratch_end)
992 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
993 "0x%lx", (ulong_t)next_avail_addr,
994 (ulong_t)scratch_end);
995 #endif
996 (void) memset((void *)(uintptr_t)best, 0, size);
997 return ((void *)(uintptr_t)best);
998 }
999
1000 void *
mem_alloc(uint32_t size)1001 mem_alloc(uint32_t size)
1002 {
1003 return (do_mem_alloc(size, MMU_PAGESIZE));
1004 }
1005
1006
1007 /*
1008 * Build page tables to map all of memory used so far as well as the kernel.
1009 */
1010 static void
build_page_tables(void)1011 build_page_tables(void)
1012 {
1013 uint32_t psize;
1014 uint32_t level;
1015 uint32_t off;
1016 uint64_t start;
1017 #if !defined(__xpv)
1018 uint32_t i;
1019 uint64_t end;
1020 #endif /* __xpv */
1021
1022 /*
1023 * If we're on metal, we need to create the top level pagetable.
1024 */
1025 #if defined(__xpv)
1026 top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
1027 #else /* __xpv */
1028 top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
1029 #endif /* __xpv */
1030 DBG((uintptr_t)top_page_table);
1031
1032 /*
1033 * Determine if we'll use large mappings for kernel, then map it.
1034 */
1035 if (largepage_support) {
1036 psize = lpagesize;
1037 level = 1;
1038 } else {
1039 psize = MMU_PAGESIZE;
1040 level = 0;
1041 }
1042
1043 DBG_MSG("Mapping kernel\n");
1044 DBG(ktext_phys);
1045 DBG(target_kernel_text);
1046 DBG(ksize);
1047 DBG(psize);
1048 for (off = 0; off < ksize; off += psize)
1049 map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);
1050
1051 /*
1052 * The kernel will need a 1 page window to work with page tables
1053 */
1054 bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE);
1055 DBG(bi->bi_pt_window);
1056 bi->bi_pte_to_pt_window =
1057 (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
1058 DBG(bi->bi_pte_to_pt_window);
1059
1060 #if defined(__xpv)
1061 if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
1062 /* If this is a domU we're done. */
1063 DBG_MSG("\nPage tables constructed\n");
1064 return;
1065 }
1066 #endif /* __xpv */
1067
1068 /*
1069 * We need 1:1 mappings for the lower 1M of memory to access
1070 * BIOS tables used by a couple of drivers during boot.
1071 *
1072 * The following code works because our simple memory allocator
1073 * only grows usage in an upwards direction.
1074 *
1075 * Note that by this point in boot some mappings for low memory
1076 * may already exist because we've already accessed device in low
1077 * memory. (Specifically the video frame buffer and keyboard
1078 * status ports.) If we're booting on raw hardware then GRUB
1079 * created these mappings for us. If we're booting under a
1080 * hypervisor then we went ahead and remapped these devices into
1081 * memory allocated within dboot itself.
1082 */
1083 if (map_debug)
1084 dboot_printf("1:1 map pa=0..1Meg\n");
1085 for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
1086 #if defined(__xpv)
1087 map_ma_at_va(start, start, 0);
1088 #else /* __xpv */
1089 map_pa_at_va(start, start, 0);
1090 #endif /* __xpv */
1091 }
1092
1093 #if !defined(__xpv)
1094 for (i = 0; i < memlists_used; ++i) {
1095 start = memlists[i].addr;
1096
1097 end = start + memlists[i].size;
1098
1099 if (map_debug)
1100 dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
1101 start, end);
1102 while (start < end && start < next_avail_addr) {
1103 map_pa_at_va(start, start, 0);
1104 start += MMU_PAGESIZE;
1105 }
1106 }
1107 #endif /* !__xpv */
1108
1109 DBG_MSG("\nPage tables constructed\n");
1110 }
1111
/*
 * Message used by startup_kernel() for its panic when the word "multiboot"
 * is found on the boot command line. The continuation lines begin in
 * column 0 because they are part of the string literal.
 */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://www.sun.com/msg/SUNOS-8000-AK for details.\n"
1118
1119 /*
1120 * startup_kernel has a pretty simple job. It builds pagetables which reflect
1121 * 1:1 mappings for all memory in use. It then also adds mappings for
1122 * the kernel nucleus at virtual address of target_kernel_text using large page
1123 * mappings. The page table pages are also accessible at 1:1 mapped
1124 * virtual addresses.
1125 */
1126 /*ARGSUSED*/
1127 void
startup_kernel(void)1128 startup_kernel(void)
1129 {
1130 char *cmdline;
1131 uintptr_t addr;
1132 #if defined(__xpv)
1133 physdev_set_iopl_t set_iopl;
1134 #endif /* __xpv */
1135
1136 /*
1137 * At this point we are executing in a 32 bit real mode.
1138 */
1139 #if defined(__xpv)
1140 cmdline = (char *)xen_info->cmd_line;
1141 #else /* __xpv */
1142 cmdline = (char *)mb_info->cmdline;
1143 #endif /* __xpv */
1144
1145 prom_debug = (strstr(cmdline, "prom_debug") != NULL);
1146 map_debug = (strstr(cmdline, "map_debug") != NULL);
1147
1148 #if defined(__xpv)
1149 /*
1150 * For dom0, before we initialize the console subsystem we'll
1151 * need to enable io operations, so set I/O priveldge level to 1.
1152 */
1153 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1154 set_iopl.iopl = 1;
1155 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1156 }
1157 #endif /* __xpv */
1158
1159 bcons_init(cmdline);
1160 DBG_MSG("\n\nSolaris prekernel set: ");
1161 DBG_MSG(cmdline);
1162 DBG_MSG("\n");
1163
1164 if (strstr(cmdline, "multiboot") != NULL) {
1165 dboot_panic(NO_MULTIBOOT);
1166 }
1167
1168 /*
1169 * boot info must be 16 byte aligned for 64 bit kernel ABI
1170 */
1171 addr = (uintptr_t)boot_info;
1172 addr = (addr + 0xf) & ~0xf;
1173 bi = (struct xboot_info *)addr;
1174 DBG((uintptr_t)bi);
1175 bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
1176
1177 /*
1178 * Need correct target_kernel_text value
1179 */
1180 #if defined(_BOOT_TARGET_amd64)
1181 target_kernel_text = KERNEL_TEXT_amd64;
1182 #elif defined(__xpv)
1183 target_kernel_text = KERNEL_TEXT_i386_xpv;
1184 #else
1185 target_kernel_text = KERNEL_TEXT_i386;
1186 #endif
1187 DBG(target_kernel_text);
1188
1189 #if defined(__xpv)
1190
1191 /*
1192 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
1193 */
1194
1195 #if defined(_BOOT_TARGET_amd64)
1196 /*
1197 * 64-bit hypervisor.
1198 */
1199 amd64_support = 1;
1200 pae_support = 1;
1201
1202 #else /* _BOOT_TARGET_amd64 */
1203
1204 /*
1205 * See if we are running on a PAE Hypervisor
1206 */
1207 {
1208 xen_capabilities_info_t caps;
1209
1210 if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
1211 dboot_panic("HYPERVISOR_xen_version(caps) failed");
1212 caps[sizeof (caps) - 1] = 0;
1213 if (prom_debug)
1214 dboot_printf("xen capabilities %s\n", caps);
1215 if (strstr(caps, "x86_32p") != NULL)
1216 pae_support = 1;
1217 }
1218
1219 #endif /* _BOOT_TARGET_amd64 */
1220 {
1221 xen_platform_parameters_t p;
1222
1223 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
1224 dboot_panic("HYPERVISOR_xen_version(parms) failed");
1225 DBG(p.virt_start);
1226 mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
1227 }
1228
1229 /*
1230 * The hypervisor loads stuff starting at 1Gig
1231 */
1232 mfn_base = ONE_GIG;
1233 DBG(mfn_base);
1234
1235 /*
1236 * enable writable page table mode for the hypervisor
1237 */
1238 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1239 VMASST_TYPE_writable_pagetables) < 0)
1240 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
1241
1242 /*
1243 * check for NX support
1244 */
1245 if (pae_support) {
1246 uint32_t eax = 0x80000000;
1247 uint32_t edx = get_cpuid_edx(&eax);
1248
1249 if (eax >= 0x80000001) {
1250 eax = 0x80000001;
1251 edx = get_cpuid_edx(&eax);
1252 if (edx & CPUID_AMD_EDX_NX)
1253 NX_support = 1;
1254 }
1255 }
1256
1257 #if !defined(_BOOT_TARGET_amd64)
1258
1259 /*
1260 * The 32-bit hypervisor uses segmentation to protect itself from
1261 * guests. This means when a guest attempts to install a flat 4GB
1262 * code or data descriptor the 32-bit hypervisor will protect itself
1263 * by silently shrinking the segment such that if the guest attempts
1264 * any access where the hypervisor lives a #gp fault is generated.
1265 * The problem is that some applications expect a full 4GB flat
1266 * segment for their current thread pointer and will use negative
1267 * offset segment wrap around to access data. TLS support in linux
1268 * brand is one example of this.
1269 *
1270 * The 32-bit hypervisor can catch the #gp fault in these cases
1271 * and emulate the access without passing the #gp fault to the guest
1272 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
1273 * Seems like this should have been the default.
1274 * Either way, we want the hypervisor -- and not Solaris -- to deal
1275 * to deal with emulating these accesses.
1276 */
1277 if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
1278 VMASST_TYPE_4gb_segments) < 0)
1279 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
1280 #endif /* !_BOOT_TARGET_amd64 */
1281
1282 #else /* __xpv */
1283
1284 /*
1285 * use cpuid to enable MMU features
1286 */
1287 if (have_cpuid()) {
1288 uint32_t eax, edx;
1289
1290 eax = 1;
1291 edx = get_cpuid_edx(&eax);
1292 if (edx & CPUID_INTC_EDX_PSE)
1293 largepage_support = 1;
1294 if (edx & CPUID_INTC_EDX_PGE)
1295 pge_support = 1;
1296 if (edx & CPUID_INTC_EDX_PAE)
1297 pae_support = 1;
1298
1299 eax = 0x80000000;
1300 edx = get_cpuid_edx(&eax);
1301 if (eax >= 0x80000001) {
1302 eax = 0x80000001;
1303 edx = get_cpuid_edx(&eax);
1304 if (edx & CPUID_AMD_EDX_LM)
1305 amd64_support = 1;
1306 if (edx & CPUID_AMD_EDX_NX)
1307 NX_support = 1;
1308 }
1309 } else {
1310 dboot_printf("cpuid not supported\n");
1311 }
1312 #endif /* __xpv */
1313
1314
1315 #if defined(_BOOT_TARGET_amd64)
1316 if (amd64_support == 0)
1317 dboot_panic("long mode not supported, rebooting");
1318 else if (pae_support == 0)
1319 dboot_panic("long mode, but no PAE; rebooting");
1320 #else
1321 /*
1322 * Allow the command line to over-ride use of PAE for 32 bit.
1323 */
1324 if (strstr(cmdline, "disablePAE=true") != NULL) {
1325 pae_support = 0;
1326 NX_support = 0;
1327 amd64_support = 0;
1328 }
1329 #endif
1330
1331 /*
1332 * initialize the simple memory allocator
1333 */
1334 init_mem_alloc();
1335
1336 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
1337 /*
1338 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
1339 */
1340 if (max_mem < FOUR_GIG && NX_support == 0)
1341 pae_support = 0;
1342 #endif
1343
1344 /*
1345 * configure mmu information
1346 */
1347 if (pae_support) {
1348 shift_amt = shift_amt_pae;
1349 ptes_per_table = 512;
1350 pte_size = 8;
1351 lpagesize = TWO_MEG;
1352 #if defined(_BOOT_TARGET_amd64)
1353 top_level = 3;
1354 #else
1355 top_level = 2;
1356 #endif
1357 } else {
1358 pae_support = 0;
1359 NX_support = 0;
1360 shift_amt = shift_amt_nopae;
1361 ptes_per_table = 1024;
1362 pte_size = 4;
1363 lpagesize = FOUR_MEG;
1364 top_level = 1;
1365 }
1366
1367 DBG(pge_support);
1368 DBG(NX_support);
1369 DBG(largepage_support);
1370 DBG(amd64_support);
1371 DBG(top_level);
1372 DBG(pte_size);
1373 DBG(ptes_per_table);
1374 DBG(lpagesize);
1375
1376 #if defined(__xpv)
1377 ktext_phys = ONE_GIG; /* from UNIX Mapfile */
1378 #else
1379 ktext_phys = FOUR_MEG; /* from UNIX Mapfile */
1380 #endif
1381
1382 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
1383 /*
1384 * For grub, copy kernel bits from the ELF64 file to final place.
1385 */
1386 DBG_MSG("\nAllocating nucleus pages.\n");
1387 ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
1388 if (ktext_phys == 0)
1389 dboot_panic("failed to allocate aligned kernel memory");
1390 if (dboot_elfload64(mb_header.load_addr) != 0)
1391 dboot_panic("failed to parse kernel ELF image, rebooting");
1392 #endif
1393
1394 DBG(ktext_phys);
1395
1396 /*
1397 * Allocate page tables.
1398 */
1399 build_page_tables();
1400
1401 /*
1402 * return to assembly code to switch to running kernel
1403 */
1404 entry_addr_low = (uint32_t)target_kernel_text;
1405 DBG(entry_addr_low);
1406 bi->bi_use_largepage = largepage_support;
1407 bi->bi_use_pae = pae_support;
1408 bi->bi_use_pge = pge_support;
1409 bi->bi_use_nx = NX_support;
1410
1411 #if defined(__xpv)
1412
1413 bi->bi_next_paddr = next_avail_addr - mfn_base;
1414 DBG(bi->bi_next_paddr);
1415 bi->bi_next_vaddr = (native_ptr_t)next_avail_addr;
1416 DBG(bi->bi_next_vaddr);
1417
1418 /*
1419 * unmap unused pages in start area to make them available for DMA
1420 */
1421 while (next_avail_addr < scratch_end) {
1422 (void) HYPERVISOR_update_va_mapping(next_avail_addr,
1423 0, UVMF_INVLPG | UVMF_LOCAL);
1424 next_avail_addr += MMU_PAGESIZE;
1425 }
1426
1427 bi->bi_xen_start_info = (uintptr_t)xen_info;
1428 DBG((uintptr_t)HYPERVISOR_shared_info);
1429 bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
1430 bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
1431
1432 #else /* __xpv */
1433
1434 bi->bi_next_paddr = next_avail_addr;
1435 DBG(bi->bi_next_paddr);
1436 bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
1437 DBG(bi->bi_next_vaddr);
1438 bi->bi_mb_info = (uintptr_t)mb_info;
1439 bi->bi_top_page_table = (uintptr_t)top_page_table;
1440
1441 #endif /* __xpv */
1442
1443 bi->bi_kseg_size = FOUR_MEG;
1444 DBG(bi->bi_kseg_size);
1445
1446 #ifndef __xpv
1447 if (map_debug)
1448 dump_tables();
1449 #endif
1450
1451 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
1452 }
1453