xref: /netbsd-src/sys/arch/i386/i386/machdep.c (revision 60196e5ccd47f6fe2a91430034a883c25c30f8f5)
1 /*	$NetBSD: machdep.c,v 1.842 2024/06/27 23:58:46 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009, 2017
5  *     The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Charles M. Hannum, by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility NASA Ames Research Center, by Julio M. Merino Vidal,
11  * by Andrew Doran, and by Maxime Villard.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
37  * All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * William Jolitz.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.842 2024/06/27 23:58:46 riastradh Exp $");
71 
72 #include "opt_beep.h"
73 #include "opt_compat_freebsd.h"
74 #include "opt_compat_netbsd.h"
75 #include "opt_cpureset_delay.h"
76 #include "opt_ddb.h"
77 #include "opt_kgdb.h"
78 #include "opt_mtrr.h"
79 #include "opt_modular.h"
80 #include "opt_multiboot.h"
81 #include "opt_multiprocessor.h"
82 #include "opt_physmem.h"
83 #include "opt_realmem.h"
84 #include "opt_user_ldt.h"
85 #include "opt_xen.h"
86 #include "isa.h"
87 #include "pci.h"
88 
89 #include <sys/param.h>
90 #include <sys/systm.h>
91 #include <sys/signal.h>
92 #include <sys/signalvar.h>
93 #include <sys/kernel.h>
94 #include <sys/cpu.h>
95 #include <sys/exec.h>
96 #include <sys/fcntl.h>
97 #include <sys/reboot.h>
98 #include <sys/conf.h>
99 #include <sys/kauth.h>
100 #include <sys/msgbuf.h>
101 #include <sys/mount.h>
102 #include <sys/syscallargs.h>
103 #include <sys/core.h>
104 #include <sys/kcore.h>
105 #include <sys/ucontext.h>
106 #include <sys/ras.h>
107 #include <sys/ksyms.h>
108 #include <sys/device.h>
109 #include <sys/timevar.h>
110 
111 #ifdef KGDB
112 #include <sys/kgdb.h>
113 #endif
114 
115 #include <dev/cons.h>
116 #include <dev/mm.h>
117 
118 #include <uvm/uvm.h>
119 #include <uvm/uvm_page.h>
120 
121 #include <sys/sysctl.h>
122 
123 #include <x86/efi.h>
124 
125 #include <machine/cpu.h>
126 #include <machine/cpu_rng.h>
127 #include <machine/cpufunc.h>
128 #include <machine/cpuvar.h>
129 #include <machine/gdt.h>
130 #include <machine/intr.h>
131 #include <machine/kcore.h>
132 #include <machine/pio.h>
133 #include <machine/psl.h>
134 #include <machine/reg.h>
135 #include <machine/specialreg.h>
136 #include <machine/bootinfo.h>
137 #include <machine/mtrr.h>
138 #include <machine/pmap_private.h>
139 #include <x86/x86/tsc.h>
140 
141 #include <x86/bootspace.h>
142 #include <x86/fpu.h>
143 #include <x86/dbregs.h>
144 #include <x86/machdep.h>
145 
146 #include <machine/multiboot.h>
147 
148 #ifdef XEN
149 #include <xen/evtchn.h>
150 #include <xen/xen.h>
151 #include <xen/hypervisor.h>
152 #endif
153 
154 #include <dev/isa/isareg.h>
155 #include <machine/isa_machdep.h>
156 #include <dev/ic/i8042reg.h>
157 
158 #include <ddb/db_active.h>
159 
160 #ifdef DDB
161 #include <machine/db_machdep.h>
162 #include <ddb/db_extern.h>
163 #endif
164 
165 #include "acpica.h"
166 #include "bioscall.h"
167 
168 #if NBIOSCALL > 0
169 #include <machine/bioscall.h>
170 #endif
171 
172 #if NACPICA > 0
173 #include <dev/acpi/acpivar.h>
174 #define ACPI_MACHDEP_PRIVATE
175 #include <machine/acpi_machdep.h>
176 #else
177 #include <machine/i82489var.h>
178 #endif
179 
180 #include "isa.h"
181 #include "isadma.h"
182 #include "ksyms.h"
183 
184 #include "cardbus.h"
185 #if NCARDBUS > 0
186 /* For rbus_min_start hint. */
187 #include <sys/bus.h>
188 #include <dev/cardbus/rbus.h>
189 #include <machine/rbus_machdep.h>
190 #endif
191 
192 #include "mca.h"
193 #if NMCA > 0
194 #include <machine/mca_machdep.h>	/* for mca_busprobe() */
195 #endif
196 
197 #ifdef MULTIPROCESSOR		/* XXX */
198 #include <machine/mpbiosvar.h>	/* XXX */
199 #endif				/* XXX */
200 
201 /* the following is used externally (sysctl_hw) */
202 char machine[] = "i386";		/* CPU "architecture" */
203 char machine_arch[] = "i386";		/* machine == machine_arch */
204 
205 #ifdef CPURESET_DELAY
206 int cpureset_delay = CPURESET_DELAY;
207 #else
208 int cpureset_delay = 2000; /* default to 2s */
209 #endif
210 
211 #ifdef MTRR
212 const struct mtrr_funcs *mtrr_funcs;
213 #endif
214 
215 int cpu_class;
216 int use_pae;
217 int i386_fpu_fdivbug;
218 
219 int i386_use_fxsave;
220 int i386_has_sse;
221 int i386_has_sse2;
222 
223 vaddr_t idt_vaddr;
224 paddr_t idt_paddr;
225 vaddr_t gdt_vaddr;
226 paddr_t gdt_paddr;
227 vaddr_t ldt_vaddr;
228 paddr_t ldt_paddr;
229 
230 vaddr_t pentium_idt_vaddr;
231 
232 struct vm_map *phys_map = NULL;
233 
234 extern struct bootspace bootspace;
235 
236 extern paddr_t lowmem_rsvd;
237 extern paddr_t avail_start, avail_end;
238 #ifdef XENPV
239 extern paddr_t pmap_pa_start, pmap_pa_end;
240 void hypervisor_callback(void);
241 void failsafe_callback(void);
242 #endif
243 
244 /*
245  * Size of memory segments, before any memory is stolen.
246  */
247 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
248 int mem_cluster_cnt = 0;
249 
250 void init_bootspace(void);
251 void init386(paddr_t);
252 void initgdt(union descriptor *);
253 
254 static void i386_proc0_pcb_ldt_init(void);
255 
256 int *esym;
257 int *eblob;
258 extern int boothowto;
259 
260 #ifndef XENPV
261 
262 /* Base memory reported by BIOS. */
263 #ifndef REALBASEMEM
264 int biosbasemem = 0;
265 #else
266 int biosbasemem = REALBASEMEM;
267 #endif
268 
269 /* Extended memory reported by BIOS. */
270 #ifndef REALEXTMEM
271 int biosextmem = 0;
272 #else
273 int biosextmem = REALEXTMEM;
274 #endif
275 
276 /* Set if any boot-loader set biosbasemem/biosextmem. */
277 int biosmem_implicit;
278 
279 /*
280  * Representation of the bootinfo structure constructed by a NetBSD native
281  * boot loader.  Only be used by native_loader().
282  */
283 struct bootinfo_source {
284 	uint32_t bs_naddrs;
285 	void *bs_addrs[1]; /* Actually longer. */
286 };
287 
288 /* Only called by locore.S; no need to be in a header file. */
289 void native_loader(int, int, struct bootinfo_source *, paddr_t, int, int);
290 
291 /*
292  * Called as one of the very first things during system startup (just after
293  * the boot loader gave control to the kernel image), this routine is in
294  * charge of retrieving the parameters passed in by the boot loader and
295  * storing them in the appropriate kernel variables.
296  *
297  * WARNING: Because the kernel has not yet relocated itself to KERNBASE,
298  * special care has to be taken when accessing memory because absolute
299  * addresses (referring to kernel symbols) do not work.  So:
300  *
301  *     1) Avoid jumps to absolute addresses (such as gotos and switches).
302  *     2) To access global variables use their physical address, which
303  *        can be obtained using the RELOC macro.
304  */
305 void
306 native_loader(int bl_boothowto, int bl_bootdev,
307     struct bootinfo_source *bl_bootinfo, paddr_t bl_esym,
308     int bl_biosextmem, int bl_biosbasemem)
309 {
310 #define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE))
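	/*
	 * A sketch of what RELOC computes (the concrete addresses are
	 * hypothetical): with the i386 KERNBASE of 0xc0000000,
	 *
	 *	int *p = RELOC(int *, &boothowto);
	 *
	 * turns a link-time address such as 0xc0412345 into the
	 * physical address 0x00412345 that the variable occupies
	 * before relocation.
	 */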
311 
312 	*RELOC(int *, &boothowto) = bl_boothowto;
313 
314 	/*
315 	 * The boot loader provides a physical, non-relocated address
316 	 * for the symbol table's end.  We need to convert it to a
317 	 * virtual address.
318 	 */
319 	if (bl_esym != 0)
320 		*RELOC(int **, &esym) = (int *)((vaddr_t)bl_esym + KERNBASE);
321 	else
322 		*RELOC(int **, &esym) = 0;
323 
324 	/*
325 	 * Copy bootinfo entries (if any) from the boot loader's
326 	 * representation to the kernel's bootinfo space.
327 	 */
328 	if (bl_bootinfo != NULL) {
329 		size_t i;
330 		uint8_t *data;
331 		struct bootinfo *bidest;
332 		struct btinfo_modulelist *bi;
333 
334 		bidest = RELOC(struct bootinfo *, &bootinfo);
335 
336 		data = &bidest->bi_data[0];
337 
338 		for (i = 0; i < bl_bootinfo->bs_naddrs; i++) {
339 			struct btinfo_common *bc;
340 
341 			bc = bl_bootinfo->bs_addrs[i];
342 
343 			if ((data + bc->len) >
344 			    (&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
345 				break;
346 
347 			memcpy(data, bc, bc->len);
348 			/*
349 			 * If any modules were loaded, record where they
350 			 * end.  We'll need to skip over them.
351 			 */
352 			bi = (struct btinfo_modulelist *)data;
353 			if (bi->common.type == BTINFO_MODULELIST) {
354 				*RELOC(int **, &eblob) =
355 				    (int *)(bi->endpa + KERNBASE);
356 			}
357 			data += bc->len;
358 		}
359 		bidest->bi_nentries = i;
360 	}
361 
362 	/*
363 	 * Configure biosbasemem and biosextmem only if they were not
364 	 * explicitly given during the kernel's build.
365 	 */
366 	if (*RELOC(int *, &biosbasemem) == 0) {
367 		*RELOC(int *, &biosbasemem) = bl_biosbasemem;
368 		*RELOC(int *, &biosmem_implicit) = 1;
369 	}
370 	if (*RELOC(int *, &biosextmem) == 0) {
371 		*RELOC(int *, &biosextmem) = bl_biosextmem;
372 		*RELOC(int *, &biosmem_implicit) = 1;
373 	}
374 #undef RELOC
375 }
376 
377 #endif /* XENPV */
378 
379 /*
380  * Machine-dependent startup code
381  */
382 void
383 cpu_startup(void)
384 {
385 	int x, y;
386 	vaddr_t minaddr, maxaddr;
387 	psize_t sz;
388 
389 	/*
390 	 * For console drivers that require uvm and pmap to be initialized,
391 	 * we'll give them one more chance here...
392 	 */
393 	consinit();
394 
395 	/*
396 	 * Initialize error message buffer (et end of core).
397 	 */
398 	if (msgbuf_p_cnt == 0)
399 		panic("msgbuf paddr map has not been set up");
400 	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
401 		continue;
402 
403 	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
404 	if (msgbuf_vaddr == 0)
405 		panic("failed to valloc msgbuf_vaddr");
406 
407 	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
408 		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
409 			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
410 			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
411 			    VM_PROT_READ|VM_PROT_WRITE, 0);
412 	}
413 
414 	pmap_update(pmap_kernel());
415 
416 	initmsgbuf((void *)msgbuf_vaddr, sz);
417 
418 #ifdef MULTIBOOT
419 	multiboot1_print_info();
420 	multiboot2_print_info();
421 #endif
422 
423 #if NCARDBUS > 0
424 	/* Tell RBUS how much RAM we have, so it can use heuristics. */
425 	rbus_min_start_hint(ctob((psize_t)physmem));
426 #endif
427 
428 	minaddr = 0;
429 
430 	/*
431 	 * Allocate a submap for physio
432 	 */
433 	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
434 	    VM_PHYS_SIZE, 0, false, NULL);
435 
436 	/* Say hello. */
437 	banner();
438 
439 	/* Safe for i/o port / memory space allocation to use malloc now. */
440 #if NISA > 0 || NPCI > 0
441 	x86_bus_space_mallocok();
442 #endif
443 
444 	gdt_init();
445 	i386_proc0_pcb_ldt_init();
446 
447 	cpu_init_tss(&cpu_info_primary);
448 #ifndef XENPV
449 	ltr(cpu_info_primary.ci_tss_sel);
450 #endif
451 
452 	x86_startup();
453 }
454 
455 /*
456  * Set up proc0's PCB and LDT.
457  */
458 static void
459 i386_proc0_pcb_ldt_init(void)
460 {
461 	struct lwp *l = &lwp0;
462 	struct pcb *pcb = lwp_getpcb(l);
463 
464 	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
465 	pcb->pcb_esp0 = uvm_lwp_getuarea(l) + USPACE - 16;
466 	pcb->pcb_iopl = IOPL_KPL;
467 	l->l_md.md_regs = (struct trapframe *)pcb->pcb_esp0 - 1;
468 	memcpy(&pcb->pcb_fsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_fsd));
469 	memcpy(&pcb->pcb_gsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_gsd));
470 	pcb->pcb_dbregs = NULL;
471 
472 #ifndef XENPV
473 	lldt(GSEL(GLDT_SEL, SEL_KPL));
474 #else
475 	HYPERVISOR_fpu_taskswitch(1);
476 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
477 #endif
478 }
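/*
 * A rough sketch of the stack layout implied by the assignments above,
 * from the top of lwp0's uarea downward:
 *
 *	uarea + USPACE                      top of the uarea
 *	uarea + USPACE - 16                 pcb_esp0 (16 bytes of slop)
 *	pcb_esp0 - sizeof(struct trapframe) l_md.md_regs, the trapframe
 *	                                    filled in on user->kernel entry
 */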
479 
480 #ifdef XENPV
481 /* used in assembly */
482 void i386_switch_context(lwp_t *);
483 void i386_tls_switch(lwp_t *);
484 
485 /*
486  * Switch context:
487  * - switch stack pointer for user->kernel transition
488  */
489 void
490 i386_switch_context(lwp_t *l)
491 {
492 	struct pcb *pcb;
493 
494 	pcb = lwp_getpcb(l);
495 
496 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
497 
498 	struct physdev_set_iopl set_iopl;
499 	set_iopl.iopl = pcb->pcb_iopl;
500 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
501 }
502 
503 void
504 i386_tls_switch(lwp_t *l)
505 {
506 	struct cpu_info *ci = curcpu();
507 	struct pcb *pcb = lwp_getpcb(l);
508 
509 	/*
510 	 * Raise the IPL to IPL_HIGH. XXX Still needed?
511 	 */
512 	(void)splhigh();
513 
514 	/* Update TLS segment pointers */
515 	update_descriptor(&ci->ci_gdt[GUFS_SEL],
516 	    (union descriptor *)&pcb->pcb_fsd);
517 	update_descriptor(&ci->ci_gdt[GUGS_SEL],
518 	    (union descriptor *)&pcb->pcb_gsd);
519 }
520 #endif /* XENPV */
521 
522 /* XXX */
523 #define IDTVEC(name)	__CONCAT(X, name)
524 typedef void (vector)(void);
525 
526 #ifndef XENPV
527 static void	tss_init(struct i386tss *, void *, void *);
528 
529 static void
530 tss_init(struct i386tss *tss, void *stack, void *func)
531 {
532 	KASSERT(curcpu()->ci_pmap == pmap_kernel());
533 
534 	memset(tss, 0, sizeof *tss);
535 	tss->tss_esp0 = tss->tss_esp = (int)((char *)stack + USPACE - 16);
536 	tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
537 	tss->__tss_cs = GSEL(GCODE_SEL, SEL_KPL);
538 	tss->tss_fs = GSEL(GCPU_SEL, SEL_KPL);
539 	tss->tss_gs = tss->__tss_es = tss->__tss_ds =
540 	    tss->__tss_ss = GSEL(GDATA_SEL, SEL_KPL);
541 	/* %cr3 contains the value associated with pmap_kernel */
542 	tss->tss_cr3 = rcr3();
543 	tss->tss_esp = (int)((char *)stack + USPACE - 16);
544 	tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
545 	tss->__tss_eflags = PSL_MBO | PSL_NT;	/* XXX not needed? */
546 	tss->__tss_eip = (int)func;
547 }
548 
549 extern vector IDTVEC(tss_trap08);
550 #if defined(DDB) && defined(MULTIPROCESSOR)
551 extern vector Xintr_ddbipi, Xintr_x2apic_ddbipi;
552 extern int ddb_vec;
553 #endif
554 
555 void
556 cpu_set_tss_gates(struct cpu_info *ci)
557 {
558 	struct segment_descriptor sd;
559 	void *doubleflt_stack;
560 	idt_descriptor_t *idt;
561 
562 	doubleflt_stack = (void *)uvm_km_alloc(kernel_map, USPACE, 0,
563 	    UVM_KMF_WIRED);
564 	tss_init(&ci->ci_tss->dblflt_tss, doubleflt_stack, IDTVEC(tss_trap08));
565 
566 	setsegment(&sd, &ci->ci_tss->dblflt_tss, sizeof(struct i386tss) - 1,
567 	    SDT_SYS386TSS, SEL_KPL, 0, 0);
568 	ci->ci_gdt[GTRAPTSS_SEL].sd = sd;
569 
570 	idt = cpu_info_primary.ci_idtvec.iv_idt;
571 	set_idtgate(&idt[8], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
572 	    GSEL(GTRAPTSS_SEL, SEL_KPL));
573 
574 #if defined(DDB) && defined(MULTIPROCESSOR)
575 	/*
576 	 * Set up separate handler for the DDB IPI, so that it doesn't
577 	 * stomp on a possibly corrupted stack.
578 	 *
579 	 * XXX overwriting the gate set in db_machine_init.
580 	 * Should rearrange the code so that it's set only once.
581 	 */
582 	void *ddbipi_stack;
583 
584 	ddbipi_stack = (void *)uvm_km_alloc(kernel_map, USPACE, 0,
585 	    UVM_KMF_WIRED);
586 	tss_init(&ci->ci_tss->ddbipi_tss, ddbipi_stack,
587 	    x2apic_mode ? Xintr_x2apic_ddbipi : Xintr_ddbipi);
588 
589 	setsegment(&sd, &ci->ci_tss->ddbipi_tss, sizeof(struct i386tss) - 1,
590 	    SDT_SYS386TSS, SEL_KPL, 0, 0);
591 	ci->ci_gdt[GIPITSS_SEL].sd = sd;
592 
593 	set_idtgate(&idt[ddb_vec], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
594 	    GSEL(GIPITSS_SEL, SEL_KPL));
595 #endif
596 }
597 #endif /* XENPV */
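/*
 * Why task gates (a rough sketch): routing trap 8 (double fault)
 * through a task gate makes the CPU perform a hardware task switch
 * onto the fresh dblflt_tss stack.  An ordinary interrupt gate would
 * push the exception frame onto the already-broken kernel stack and
 * likely escalate to a triple fault; the same reasoning applies to
 * the DDB IPI TSS.
 */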
598 
599 /*
600  * Set up TSS and I/O bitmap.
601  */
602 void
603 cpu_init_tss(struct cpu_info *ci)
604 {
605 	struct cpu_tss *cputss;
606 
607 	cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
608 	    sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
609 
610 	cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
611 #ifndef XENPV
612 	cputss->tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
613 	cputss->tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
614 	cputss->tss.tss_cr3 = rcr3();
615 #endif
616 
617 	ci->ci_tss = cputss;
618 #ifndef XENPV
619 	ci->ci_tss_sel = tss_alloc(&cputss->tss);
620 #endif
621 }
622 
623 void *
624 getframe(struct lwp *l, int sig, int *onstack)
625 {
626 	struct proc *p = l->l_proc;
627 	struct trapframe *tf = l->l_md.md_regs;
628 
629 	/* Do we need to jump onto the signal stack? */
630 	*onstack = (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
631 	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
632 	if (*onstack)
633 		return (char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size;
634 	return (void *)tf->tf_esp;
635 }
636 
637 /*
638  * Build context to run handler in.  We invoke the handler
639  * directly, only returning via the trampoline.  Note the
640  * trampoline version numbers are coordinated with machine-
641  * dependent code in libc.
642  */
643 void
644 buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
645 {
646 	struct trapframe *tf = l->l_md.md_regs;
647 
648 	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
649 	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
650 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
651 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
652 	tf->tf_eip = (int)catcher;
653 	tf->tf_cs = GSEL(sel, SEL_UPL);
654 	tf->tf_eflags &= ~PSL_CLEARSIG;
655 	tf->tf_esp = (int)fp;
656 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
657 
658 	/* Ensure FP state is reset. */
659 	fpu_sigreset(l);
660 }
661 
662 void
663 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
664 {
665 	struct lwp *l = curlwp;
666 	struct proc *p = l->l_proc;
667 	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
668 	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
669 	    GUCODEBIG_SEL : GUCODE_SEL;
670 	struct sigacts *ps = p->p_sigacts;
671 	int onstack, error;
672 	int sig = ksi->ksi_signo;
673 	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
674 	sig_t catcher = SIGACTION(p, sig).sa_handler;
675 
676 	KASSERT(mutex_owned(p->p_lock));
677 
678 	fp--;
679 
680 	memset(&frame, 0, sizeof(frame));
681 	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
682 	frame.sf_signum = sig;
683 	frame.sf_sip = &fp->sf_si;
684 	frame.sf_ucp = &fp->sf_uc;
685 	frame.sf_si._info = ksi->ksi_info;
686 	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
687 	frame.sf_uc.uc_sigmask = *mask;
688 	frame.sf_uc.uc_link = l->l_ctxlink;
689 	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
690 	    ? _UC_SETSTACK : _UC_CLRSTACK;
691 
692 	sendsig_reset(l, sig);
693 
694 	mutex_exit(p->p_lock);
695 	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
696 	error = copyout(&frame, fp, sizeof(frame));
697 	mutex_enter(p->p_lock);
698 
699 	if (error != 0) {
700 		/*
701 		 * Process has trashed its stack; give it an illegal
702 		 * instruction to halt it in its tracks.
703 		 */
704 		sigexit(l, SIGILL);
705 		/* NOTREACHED */
706 	}
707 
708 	buildcontext(l, sel, catcher, fp);
709 
710 	/* Remember that we're now on the signal stack. */
711 	if (onstack)
712 		l->l_sigstk.ss_flags |= SS_ONSTACK;
713 }
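/*
 * A rough sketch of the resulting user stack, assuming the members of
 * struct sigframe_siginfo are laid out in the order they are filled
 * in above (lowest address first):
 *
 *	fp->sf_ra      return address: ps->sa_sigdesc[sig].sd_tramp
 *	fp->sf_signum  \
 *	fp->sf_sip      > arguments to the SA_SIGINFO handler
 *	fp->sf_ucp     /
 *	fp->sf_si      siginfo copied from ksi
 *	fp->sf_uc      ucontext restored by the trampoline
 *
 * buildcontext() points tf_eip at the handler, so the handler's
 * return lands in the libc trampoline, which restores sf_uc via
 * setcontext(2).
 */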
714 
715 static void
716 maybe_dump(int howto)
717 {
718 	int s;
719 
720 	/* Disable interrupts. */
721 	s = splhigh();
722 
723 	/* Do a dump if requested. */
724 	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
725 		dumpsys();
726 
727 	splx(s);
728 }
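/*
 * In other words: a dump is taken only when RB_DUMP is set and
 * RB_HALT is not - roughly, "reboot -d" dumps while "halt -d" does
 * not.
 */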
729 
730 void
731 cpu_reboot(int howto, char *bootstr)
732 {
733 	static bool syncdone = false;
734 	int s = IPL_NONE;
735 
736 	if (cold) {
737 		howto |= RB_HALT;
738 		goto haltsys;
739 	}
740 
741 	boothowto = howto;
742 
743 	/* XXX used to dump after vfs_shutdown() and before
744 	 * detaching devices / shutdown hooks / pmf_system_shutdown().
745 	 */
746 	maybe_dump(howto);
747 
748 	/*
749 	 * If we've panic'd, don't make the situation potentially
750 	 * worse by syncing or unmounting the file systems.
751 	 */
752 	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
753 		if (!syncdone) {
754 			syncdone = true;
755 			/* XXX used to force unmount as well, here */
756 			vfs_sync_all(curlwp);
757 		}
758 
759 		while (vfs_unmountall1(curlwp, false, false) ||
760 		       config_detach_all(boothowto) ||
761 		       vfs_unmount_forceone(curlwp))
762 			;	/* do nothing */
763 	} else {
764 		if (!db_active)
765 			suspendsched();
766 	}
767 
768 	pmf_system_shutdown(boothowto);
769 
770 	s = splhigh();
771 
772 	/* amd64 maybe_dump() */
773 
774 haltsys:
775 	doshutdownhooks();
776 
777 	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
778 #if NACPICA > 0
779 		if (s != IPL_NONE)
780 			splx(s);
781 
782 		acpi_enter_sleep_state(ACPI_STATE_S5);
783 #else
784 		__USE(s);
785 #endif
786 #ifdef XEN
787 		if (vm_guest == VM_GUEST_XENPV ||
788 		    vm_guest == VM_GUEST_XENPVH ||
789 		    vm_guest == VM_GUEST_XENPVHVM)
790 			HYPERVISOR_shutdown();
791 #endif /* XEN */
792 	}
793 
794 #ifdef MULTIPROCESSOR
795 	cpu_broadcast_halt();
796 #endif /* MULTIPROCESSOR */
797 
798 	if (howto & RB_HALT) {
799 #if NACPICA > 0
800 		acpi_disable();
801 #endif
802 
803 		printf("\n");
804 		printf("The operating system has halted.\n");
805 		printf("Please press any key to reboot.\n\n");
806 
807 #ifdef BEEP_ONHALT
808 		{
809 			int c;
810 			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
811 				sysbeep(BEEP_ONHALT_PITCH,
812 					BEEP_ONHALT_PERIOD * hz / 1000);
813 				delay(BEEP_ONHALT_PERIOD * 1000);
814 				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
815 				delay(BEEP_ONHALT_PERIOD * 1000);
816 			}
817 		}
818 #endif
819 
820 		cnpollc(1);	/* for proper keyboard command handling */
821 		if (cngetc() == 0) {
822 			/* no console attached, so just hlt */
823 			printf("No keyboard - cannot reboot after all.\n");
824 			for(;;) {
825 				x86_hlt();
826 			}
827 		}
828 		cnpollc(0);
829 	}
830 
831 	printf("rebooting...\n");
832 	if (cpureset_delay > 0)
833 		delay(cpureset_delay * 1000);
834 	cpu_reset();
835 	for(;;) ;
836 	/*NOTREACHED*/
837 }
838 
839 /*
840  * Clear registers on exec
841  */
842 void
843 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
844 {
845 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
846 	struct pcb *pcb = lwp_getpcb(l);
847 	struct trapframe *tf;
848 
849 #ifdef USER_LDT
850 	pmap_ldt_cleanup(l);
851 #endif
852 
853 	fpu_clear(l, pack->ep_osversion >= 699002600
854 	    ? __INITIAL_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
855 
856 	memcpy(&pcb->pcb_fsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_fsd));
857 	memcpy(&pcb->pcb_gsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_gsd));
858 
859 	x86_dbregs_clear(l);
860 
861 	tf = l->l_md.md_regs;
862 	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
863 	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
864 	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
865 	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
866 	tf->tf_edi = 0;
867 	tf->tf_esi = 0;
868 	tf->tf_ebp = 0;
869 	tf->tf_ebx = l->l_proc->p_psstrp;
870 	tf->tf_edx = 0;
871 	tf->tf_ecx = 0;
872 	tf->tf_eax = 0;
873 	tf->tf_eip = pack->ep_entry;
874 	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
875 	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
876 	tf->tf_eflags = PSL_USERSET;
877 	tf->tf_esp = stack;
878 	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
879 }
880 
881 /*
882  * Initialize segments and descriptor tables
883  */
884 
885 union descriptor *gdtstore, *ldtstore;
886 union descriptor *pentium_idt;
887 extern vaddr_t lwp0uarea;
888 
889 void
890 setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
891     int sel)
892 {
893 
894 	gd->gd_looffset = (int)func;
895 	gd->gd_selector = sel;
896 	gd->gd_stkcpy = args;
897 	gd->gd_xx = 0;
898 	gd->gd_type = type;
899 	gd->gd_dpl = dpl;
900 	gd->gd_p = 1;
901 	gd->gd_hioffset = (int)func >> 16;
902 }
903 
904 void
905 unsetgate(struct gate_descriptor *gd)
906 {
907 
908 	gd->gd_p = 0;
909 	gd->gd_hioffset = 0;
910 	gd->gd_looffset = 0;
911 	gd->gd_selector = 0;
912 	gd->gd_xx = 0;
913 	gd->gd_stkcpy = 0;
914 	gd->gd_type = 0;
915 	gd->gd_dpl = 0;
916 }
917 
918 void
919 setregion(struct region_descriptor *rd, void *base, size_t limit)
920 {
921 
922 	rd->rd_limit = (int)limit;
923 	rd->rd_base = (int)base;
924 }
925 
926 void
927 setsegment(struct segment_descriptor *sd, const void *base, size_t limit,
928     int type, int dpl, int def32, int gran)
929 {
930 
931 	sd->sd_lolimit = (int)limit;
932 	sd->sd_lobase = (int)base;
933 	sd->sd_type = type;
934 	sd->sd_dpl = dpl;
935 	sd->sd_p = 1;
936 	sd->sd_hilimit = (int)limit >> 16;
937 	sd->sd_xx = 0;
938 	sd->sd_def32 = def32;
939 	sd->sd_gran = gran;
940 	sd->sd_hibase = (int)base >> 24;
941 }
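/*
 * A worked example (the call appears in initgdt() below): the flat
 * kernel code segment
 *
 *	setsegment(&gdtstore[GCODE_SEL].sd, 0, 0xfffff,
 *	    SDT_MEMERA, SEL_KPL, 1, 1);
 *
 * encodes base 0 and limit 0xfffff with gran=1 (4KB granularity, so
 * the segment spans the full 4GB), def32=1 (32-bit default operand
 * size), and DPL 0.
 */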
942 
943 /* XXX */
944 extern vector IDTVEC(syscall);
945 extern vector *IDTVEC(exceptions)[];
946 #ifdef XENPV
947 extern union descriptor tmpgdt[];
948 #endif
949 
950 void
951 cpu_init_idt(struct cpu_info *ci)
952 {
953 	struct region_descriptor region;
954 	struct idt_vec *iv;
955 	idt_descriptor_t *idt;
956 
957 	iv = &ci->ci_idtvec;
958 	idt = iv->iv_idt_pentium;
959 	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
960 	lidt(&region);
961 }
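/*
 * A note on iv_idt_pentium (a sketch; cf. the read-only mapping of
 * idt_paddr in init386() and the "pentium f00f bug stuff" in
 * idt_vec_init_cpu_md()): the IDT register is loaded with a second,
 * read-only alias of the IDT page.  This is the usual workaround for
 * the Pentium "f00f" erratum - the locked IDT access triggered by the
 * bogus instruction then faults cleanly instead of hanging the bus.
 */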
962 
963 /*
964  * initgdt(tgdt)
965  *
966  *	Initialize a temporary Global Descriptor Table (GDT) using
967  *	storage space at tgdt.
968  *
969  *	1. Set up segment descriptors for our purposes, including a
970  *	   CPU-local segment descriptor pointing at &cpu_info_primary.
971  *
972  *	2. Load the address into the Global Descriptor Table Register.
973  *
974  *	3. Set up segment selectors for all the segment registers using
975  *	   it so that %fs-relative addressing works for the CPU-local
976  *	   data.
977  *
978  *	After this point, CPUVAR(...), curcpu(), and curlwp will work.
979  *
980  *	Eventually the kernel will switch to a second temporary GDT
981  *	allocated with pmap_bootstrap_valloc in pmap_bootstrap, and
982  *	then to a permanent GDT allocated with uvm_km(9) in gdt_init.
983  *	But the first temporary GDT is needed now to get us going with
984  *	early access to curcpu() and curlwp before we enter kernel
985  *	main.
986  *
987  *	XXX The purpose of each of the segment descriptors should be
988  *	written down somewhere in a single place that can be cross-
989  *	referenced.
990  *
991  *	References:
992  *
993  *	- Intel 64 and IA-32 Architectures Software Developer's Manual,
994  *	  Volume 3: System Programming Guide, Order Number 325384,
995  *	  April 2022, Sec. 3.5.1 `Segment Descriptor Tables',
996  *	  pp. 3-14 through 3-16.
997  */
998 void
999 initgdt(union descriptor *tgdt)
1000 {
1001 	KASSERT(tgdt != NULL);
1002 
1003 	gdtstore = tgdt;
1004 #ifdef XENPV
1005 	u_long	frames[16];
1006 #else
1007 	struct region_descriptor region;
1008 	memset(gdtstore, 0, NGDT * sizeof(*gdtstore));
1009 #endif
1010 
1011 	/* make gdt gates and memory segments */
1012 	setsegment(&gdtstore[GCODE_SEL].sd, 0, 0xfffff,
1013 	    SDT_MEMERA, SEL_KPL, 1, 1);
1014 	setsegment(&gdtstore[GDATA_SEL].sd, 0, 0xfffff,
1015 	    SDT_MEMRWA, SEL_KPL, 1, 1);
1016 	setsegment(&gdtstore[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
1017 	    SDT_MEMERA, SEL_UPL, 1, 1);
1018 	setsegment(&gdtstore[GUCODEBIG_SEL].sd, 0, 0xfffff,
1019 	    SDT_MEMERA, SEL_UPL, 1, 1);
1020 	setsegment(&gdtstore[GUDATA_SEL].sd, 0, 0xfffff,
1021 	    SDT_MEMRWA, SEL_UPL, 1, 1);
1022 #if NBIOSCALL > 0 && !defined(XENPV)
1023 	/* bios trampoline GDT entries */
1024 	setsegment(&gdtstore[GBIOSCODE_SEL].sd, 0, 0xfffff,
1025 	    SDT_MEMERA, SEL_KPL, 0, 0);
1026 	setsegment(&gdtstore[GBIOSDATA_SEL].sd, 0, 0xfffff,
1027 	    SDT_MEMRWA, SEL_KPL, 0, 0);
1028 #endif
1029 	setsegment(&gdtstore[GCPU_SEL].sd, &cpu_info_primary,
1030 	    sizeof(struct cpu_info) - 1, SDT_MEMRWA, SEL_KPL, 1, 0);
1031 
1032 #ifndef XENPV
1033 	setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
1034 	lgdt(&region);
1035 #else /* !XENPV */
1036 	/*
1037 	 * We jumpstart the bootstrap process a bit so we can update
1038 	 * page permissions. This is done redundantly later from
1039 	 * x86_xpmap.c:xen_locore() - harmless.
1040 	 */
1041 	xpmap_phys_to_machine_mapping =
1042 	    (unsigned long *)xen_start_info.mfn_list;
1043 
1044 	frames[0] = xpmap_ptom((uint32_t)gdtstore - KERNBASE) >> PAGE_SHIFT;
1045 	{	/*
1046 		 * Enter the gdt page RO into the kernel map. We can't
1047 		 * use pmap_kenter_pa() here, because %fs is not
1048 		 * usable until the gdt is loaded, and %fs is used as
1049 		 * the base pointer for curcpu() and curlwp(), both of
1050 		 * which are in the callpath of pmap_kenter_pa().
1051 		 * So we mash up our own - this is MD code anyway.
1052 		 */
1053 		extern pt_entry_t xpmap_pg_nx;
1054 		pt_entry_t pte;
1055 
1056 		pte = pmap_pa2pte((vaddr_t)gdtstore - KERNBASE);
1057 		pte |= xpmap_pg_nx | PTE_P;
1058 
1059 		if (HYPERVISOR_update_va_mapping((vaddr_t)gdtstore, pte,
1060 		    UVMF_INVLPG) < 0) {
1061 			panic("gdt page RO update failed.\n");
1062 		}
1063 	}
1064 
1065 	if (HYPERVISOR_set_gdt(frames, NGDT /* XXX is it right ? */))
1066 		panic("HYPERVISOR_set_gdt failed!\n");
1067 
1068 	lgdt_finish();
1069 #endif /* !XENPV */
1070 }
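/*
 * A worked example (assuming the usual x86 selector encoding,
 * GSEL(n, pl) == ((n) << 3) | (pl)): after initgdt(), loading %fs
 * with GSEL(GCPU_SEL, SEL_KPL) makes %fs-relative offset 0 address
 * &cpu_info_primary, which is exactly what CPUVAR(...), curcpu() and
 * curlwp rely on.
 */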
1071 
1072 #if !defined(XENPV)  && NBIOSCALL > 0
1073 static void
1074 init386_pte0(void)
1075 {
1076 	paddr_t paddr;
1077 	vaddr_t vaddr;
1078 
1079 	paddr = 4 * PAGE_SIZE;
1080 	vaddr = (vaddr_t)vtopte(0);
1081 	pmap_kenter_pa(vaddr, paddr, VM_PROT_ALL, 0);
1082 	pmap_update(pmap_kernel());
1083 	/* make sure it is clean before using */
1084 	memset((void *)vaddr, 0, PAGE_SIZE);
1085 }
1086 #endif /* !XENPV && NBIOSCALL > 0 */
1087 
1088 #ifndef XENPV
1089 static void
1090 init386_ksyms(void)
1091 {
1092 #if NKSYMS || defined(DDB) || defined(MODULAR)
1093 	extern int end;
1094 	struct btinfo_symtab *symtab;
1095 
1096 #ifdef DDB
1097 	db_machine_init();
1098 #endif
1099 
1100 #if defined(MULTIBOOT)
1101 	if (multiboot1_ksyms_addsyms_elf())
1102 		return;
1103 
1104 	if (multiboot2_ksyms_addsyms_elf())
1105 		return;
1106 #endif
1107 
1108 	if ((symtab = lookup_bootinfo(BTINFO_SYMTAB)) == NULL) {
1109 		ksyms_addsyms_elf(*(int *)&end, ((int *)&end) + 1, esym);
1110 		return;
1111 	}
1112 
1113 	symtab->ssym += KERNBASE;
1114 	symtab->esym += KERNBASE;
1115 	ksyms_addsyms_elf(symtab->nsym, (int *)symtab->ssym, (int *)symtab->esym);
1116 #endif
1117 }
1118 #endif /* XENPV */
1119 
1120 void
1121 init_bootspace(void)
1122 {
1123 	extern char __rodata_start;
1124 	extern char __data_start;
1125 	extern char __kernel_end;
1126 	size_t i = 0;
1127 
1128 	memset(&bootspace, 0, sizeof(bootspace));
1129 
1130 	bootspace.head.va = KERNTEXTOFF;
1131 	bootspace.head.pa = KERNTEXTOFF - KERNBASE;
1132 	bootspace.head.sz = 0;
1133 
1134 	bootspace.segs[i].type = BTSEG_TEXT;
1135 	bootspace.segs[i].va = KERNTEXTOFF;
1136 	bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
1137 	bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
1138 	i++;
1139 
1140 	bootspace.segs[i].type = BTSEG_RODATA;
1141 	bootspace.segs[i].va = (vaddr_t)&__rodata_start;
1142 	bootspace.segs[i].pa = (paddr_t)(vaddr_t)&__rodata_start - KERNBASE;
1143 	bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
1144 	i++;
1145 
1146 	bootspace.segs[i].type = BTSEG_DATA;
1147 	bootspace.segs[i].va = (vaddr_t)&__data_start;
1148 	bootspace.segs[i].pa = (paddr_t)(vaddr_t)&__data_start - KERNBASE;
1149 	bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
1150 	i++;
1151 
1152 	bootspace.boot.va = (vaddr_t)&__kernel_end;
1153 	bootspace.boot.pa = (paddr_t)(vaddr_t)&__kernel_end - KERNBASE;
1154 	bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
1155 	    (size_t)&__kernel_end;
1156 
1157 	/* Virtual address of the top-level page directory */
1158 	bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
1159 }
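/*
 * The resulting layout, summarized (each pa is the matching
 * va - KERNBASE):
 *
 *	head:   [KERNTEXTOFF, KERNTEXTOFF)             empty
 *	text:   [KERNTEXTOFF, __rodata_start)          BTSEG_TEXT
 *	rodata: [__rodata_start, __data_start)         BTSEG_RODATA
 *	data:   [__data_start, __kernel_end)           BTSEG_DATA
 *	boot:   [__kernel_end, atdevbase + IOM_SIZE)
 *	pdir:   PDPpaddr + KERNBASE
 */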
1160 
1161 void
1162 init386(paddr_t first_avail)
1163 {
1164 	extern void consinit(void);
1165 	int x;
1166 #ifndef XENPV
1167 	extern paddr_t local_apic_pa;
1168 	union descriptor *tgdt;
1169 	struct region_descriptor region;
1170 #if NBIOSCALL > 0
1171 	extern int biostramp_image_size;
1172 	extern u_char biostramp_image[];
1173 #endif
1174 #endif /* !XENPV */
1175 	struct pcb *pcb;
1176 	struct idt_vec *iv;
1177 	idt_descriptor_t *idt;
1178 
1179 	KASSERT(first_avail % PAGE_SIZE == 0);
1180 
1181 #ifdef XENPV
1182 	KASSERT(HYPERVISOR_shared_info != NULL);
1183 	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1184 #endif
1185 
1186 #ifdef XEN
1187 	if (vm_guest == VM_GUEST_XENPVH)
1188 		xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1189 #endif
1190 
1191 	uvm_lwp_setuarea(&lwp0, lwp0uarea);
1192 
1193 	cpu_probe(&cpu_info_primary);
1194 
1195 	/*
1196 	 * Initialize the no-execute bit on cpu0, if supported.
1197 	 *
1198 	 * Note: The call to cpu_init_msrs for secondary CPUs happens
1199 	 * in cpu_hatch.
1200 	 */
1201 	cpu_init_msrs(&cpu_info_primary, true);
1202 
1203 #ifndef XENPV
1204 	cpu_speculation_init(&cpu_info_primary);
1205 #endif
1206 
1207 #ifdef PAE
1208 	use_pae = 1;
1209 #else
1210 	use_pae = 0;
1211 #endif
1212 
1213 	pcb = lwp_getpcb(&lwp0);
1214 #ifdef XENPV
1215 	pcb->pcb_cr3 = PDPpaddr;
1216 #endif
1217 
1218 #if defined(PAE) && !defined(XENPV)
1219 	/*
1220 	 * Save VA and PA of L3 PD of boot processor (for Xen, this is done
1221 	 * in xen_locore())
1222 	 */
1223 	cpu_info_primary.ci_pae_l3_pdirpa = rcr3();
1224 	cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE);
1225 #endif
1226 
1227 	uvm_md_init();
1228 
1229 	/*
1230 	 * Start with 2 color bins -- this is just a guess to get us
1231 	 * started.  We'll recolor when we determine the largest cache
1232 	 * sizes on the system.
1233 	 */
1234 	uvmexp.ncolors = 2;
1235 
1236 	avail_start = first_avail;
1237 
1238 #ifndef XENPV
1239 	/*
1240 	 * Low memory reservations:
1241 	 * Page 0:	BIOS data
1242 	 * Page 1:	BIOS callback
1243 	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
1244 	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
1245 	 * Page 4:	Temporary page table for 0MB-4MB
1246 	 * Page 5:	Temporary page directory
1247 	 */
1248 	lowmem_rsvd = 6 * PAGE_SIZE;
1249 #else /* !XENPV */
1250 	/* Parse Xen command line (replace bootinfo) */
1251 	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1252 
1253 	/* Use the dummy page as a gdt */
1254 	extern vaddr_t xen_dummy_page;
1255 	gdtstore = (void *)xen_dummy_page;
1256 
1257 	/* Determine physical address space */
1258 	avail_end = ctob((paddr_t)xen_start_info.nr_pages);
1259 	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1260 	pmap_pa_end = pmap_pa_start + ctob((paddr_t)xen_start_info.nr_pages);
1261 	mem_clusters[0].start = avail_start;
1262 	mem_clusters[0].size = avail_end - avail_start;
1263 	mem_cluster_cnt++;
1264 	physmem += xen_start_info.nr_pages;
1265 	uvmexp.wired += atop(avail_start);
1266 
1267 	/*
1268 	 * initgdt() has to be done before consinit(), so that %fs is properly
1269 	 * initialised. initgdt() uses pmap_kenter_pa so it can't be called
1270 	 * before the above variables are set.
1271 	 */
1272 	initgdt(gdtstore);
1273 
1274 	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1275 #endif /* XENPV */
1276 
1277 #if NISA > 0 || NPCI > 0
1278 	x86_bus_space_init();
1279 #endif
1280 
1281 	consinit();	/* XXX SHOULD NOT BE DONE HERE */
1282 
1283 #ifdef DEBUG_MEMLOAD
1284 	printf("mem_cluster_count: %d\n", mem_cluster_cnt);
1285 #endif
1286 
1287 	/*
1288 	 * Call pmap initialization to make new kernel address space.
1289 	 * We must do this before loading pages into the VM system.
1290 	 */
1291 	pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);
1292 
1293 	/*
1294 	 * Initialize RNG to get entropy ASAP either from CPU
1295 	 * RDRAND/RDSEED or from seed on disk.  Constraints:
1296 	 *
1297 	 * - Must happen after cpu_init_msrs so that curcpu() and
1298 	 *   curlwp work.
1299 	 *
1300 	 * - Must happen after consinit so we have the opportunity to
1301 	 *   print useful feedback.
1302 	 *
1303 	 * - On KASLR kernels, must happen after pmap_bootstrap because
1304 	 *   x86_rndseed requires access to the direct map.
1305 	 */
1306 	cpu_rng_init();
1307 	x86_rndseed();
1308 
1309 #ifndef XENPV
1310 	/* Initialize the memory clusters. */
1311 	init_x86_clusters();
1312 
1313 	/* Internalize the physical pages into the VM system. */
1314 	init_x86_vm(avail_start);
1315 #else /* !XENPV */
1316 	uvm_page_physload(atop(avail_start), atop(avail_end),
1317 	    atop(avail_start), atop(avail_end),
1318 	    VM_FREELIST_DEFAULT);
1319 
1320 	/* Reclaim the boot gdt page - see locore.s */
1321 	{
1322 		extern pt_entry_t xpmap_pg_nx;
1323 		pt_entry_t pte;
1324 
1325 		pte = pmap_pa2pte((vaddr_t)tmpgdt - KERNBASE);
1326 		pte |= PTE_W | xpmap_pg_nx | PTE_P;
1327 
1328 		if (HYPERVISOR_update_va_mapping((vaddr_t)tmpgdt, pte, UVMF_INVLPG) < 0) {
1329 			panic("tmpgdt page relaim RW update failed.\n");
1330 		}
1331 	}
1332 #endif /* !XENPV */
1333 
1334 	init_x86_msgbuf();
1335 
1336 #if !defined(XENPV) && NBIOSCALL > 0
1337 	/*
1338 	 * XXX Remove this
1339 	 *
1340 	 * Setup a temporary Page Table Entry to allow identity mappings of
1341 	 * the real mode address. This is required by bioscall.
1342 	 */
1343 	init386_pte0();
1344 
1345 	KASSERT(biostramp_image_size <= PAGE_SIZE);
1346 	pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, (paddr_t)BIOSTRAMP_BASE,
1347 	    VM_PROT_ALL, 0);
1348 	pmap_update(pmap_kernel());
1349 	memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
1350 
1351 	/* Needed early, for bioscall() */
1352 	cpu_info_primary.ci_pmap = pmap_kernel();
1353 #endif
1354 
1355 #ifndef XENPV
1356 	pmap_kenter_pa(local_apic_va, local_apic_pa,
1357 	    VM_PROT_READ|VM_PROT_WRITE, 0);
1358 	pmap_update(pmap_kernel());
1359 	memset((void *)local_apic_va, 0, PAGE_SIZE);
1360 #endif
1361 
1362 	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1363 	pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1364 	pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1365 	pmap_update(pmap_kernel());
1366 	memset((void *)idt_vaddr, 0, PAGE_SIZE);
1367 	memset((void *)gdt_vaddr, 0, PAGE_SIZE);
1368 	memset((void *)ldt_vaddr, 0, PAGE_SIZE);
1369 
1370 	pmap_kenter_pa(pentium_idt_vaddr, idt_paddr, VM_PROT_READ, 0);
1371 	pmap_update(pmap_kernel());
1372 	iv = &(cpu_info_primary.ci_idtvec);
1373 	idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
1374 	idt = (idt_descriptor_t *)iv->iv_idt;
1375 
1376 #ifndef XENPV
1377 	/*
1378 	 * Switch from the initial temporary GDT that was allocated on
1379 	 * the stack by our caller, start.  That temporary GDT will be
1380 	 * popped off the stack when init386 returns before start calls
1381 	 * main, so we need to use a second temporary GDT allocated in
1382 	 * pmap_bootstrap with pmap_bootstrap_valloc/palloc to make
1383 	 * sure at least the CPU-local data area, used by CPUVAR(...),
1384 	 * curcpu(), and curlwp via %fs-relative addressing, will
1385 	 * continue to work.
1386 	 *
1387 	 * Later, in gdt_init via cpu_startup, we will finally allocate
1388 	 * a permanent GDT with uvm_km(9).
1389 	 *
1390 	 * The content of the second temporary GDT is the same as the
1391 	 * content of the initial GDT, initialized in initgdt, except
1392  *	for the address of the LDT, because we are also switching
1393  *	to a new temporary LDT at a new address.
1394 	 */
1395 	tgdt = gdtstore;
1396 	gdtstore = (union descriptor *)gdt_vaddr;
1397 	ldtstore = (union descriptor *)ldt_vaddr;
1398 
1399 	memcpy(gdtstore, tgdt, NGDT * sizeof(*gdtstore));
1400 
1401 	setsegment(&gdtstore[GLDT_SEL].sd, ldtstore,
1402 	    NLDT * sizeof(ldtstore[0]) - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
1403 #else
1404 	HYPERVISOR_set_callbacks(
1405 	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
1406 	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
1407 
1408 	ldtstore = (union descriptor *)ldt_vaddr;
1409 #endif /* XENPV */
1410 
1411 	/* make ldt gates and memory segments */
1412 	ldtstore[LUCODE_SEL] = gdtstore[GUCODE_SEL];
1413 	ldtstore[LUCODEBIG_SEL] = gdtstore[GUCODEBIG_SEL];
1414 	ldtstore[LUDATA_SEL] = gdtstore[GUDATA_SEL];
1415 
1416 	/* exceptions */
1417 	for (x = 0; x < 32; x++) {
1418 		/* Reset to default. Special cases below */
1419 		int sel;
1420 #ifdef XENPV
1421 		sel = SEL_XEN;
1422 #else
1423 		sel = SEL_KPL;
1424 #endif /* XENPV */
1425 
1426 		idt_vec_reserve(iv, x);
1427 
1428  		switch (x) {
1429 #ifdef XENPV
1430 		case 2:  /* NMI */
1431 		case 18: /* MCA */
1432 			sel |= 0x4; /* Auto EOI/mask */
1433 			break;
1434 #endif /* XENPV */
1435 		case 3:
1436 		case 4:
1437 			sel = SEL_UPL;
1438 			break;
1439 		default:
1440 			break;
1441 		}
1442 		set_idtgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386IGT,
1443 		    sel, GSEL(GCODE_SEL, SEL_KPL));
1444 	}
1445 
1446 	/* new-style interrupt gate for syscalls */
1447 	idt_vec_reserve(iv, 128);
1448 	set_idtgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386IGT, SEL_UPL,
1449 	    GSEL(GCODE_SEL, SEL_KPL));
1450 
1451 #ifndef XENPV
1452 	/*
1453 	 * Activate the second temporary GDT, allocated in
1454 	 * pmap_bootstrap with pmap_bootstrap_valloc/palloc, and
1455 	 * initialized with the content of the initial temporary GDT in
1456 	 * initgdt, plus an updated LDT.
1457 	 *
1458 	 * This ensures the %fs-relative addressing for the CPU-local
1459 	 * area used by CPUVAR(...), curcpu(), and curlwp will continue
1460 	 * to work after init386 returns and the initial temporary GDT
1461 	 * is popped off, before we call main and later create a
1462 	 * permanent GDT in gdt_init via cpu_startup.
1463 	 */
1464 	setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
1465 	lgdt(&region);
1466 #endif
1467 
1468 	lldt(GSEL(GLDT_SEL, SEL_KPL));
1469 	cpu_init_idt(&cpu_info_primary);
1470 
1471 #ifdef XENPV
1472 	xen_init_ksyms();
1473 #else /* XENPV */
1474 #ifdef XEN
1475 	if (vm_guest == VM_GUEST_XENPVH)
1476 		xen_init_ksyms();
1477 	else
1478 #endif /* XEN */
1479 		init386_ksyms();
1480 #endif /* XENPV */
1481 
1482 #if NMCA > 0
1483 	/*
1484 	 * Check for an MCA bus; this needs to happen before the ISA
1485 	 * setup because, if MCA is detected, ISA must default to
1486 	 * level-triggered interrupts.
1487 	 * We also do not search for MCA using bioscall() on EFI systems
1488 	 * that lack it (they lack MCA too, anyway).
1489 	 */
1490 	if (lookup_bootinfo(BTINFO_EFI) == NULL && vm_guest != VM_GUEST_XENPVH)
1491 		mca_busprobe();
1492 #endif
1493 
1494 #ifdef XENPV
1495 	extern int tmpstk;
1496 	cpu_info_primary.ci_intrstack = &tmpstk;
1497 	events_default_setup();
1498 #else
1499 	intr_default_setup();
1500 #endif
1501 
1502 	splraise(IPL_HIGH);
1503 	x86_enable_intr();
1504 
1505 #ifdef DDB
1506 	if (boothowto & RB_KDB)
1507 		Debugger();
1508 #endif
1509 #ifdef KGDB
1510 	kgdb_port_init();
1511 	if (boothowto & RB_KDB) {
1512 		kgdb_debug_init = 1;
1513 		kgdb_connect(1);
1514 	}
1515 #endif
1516 
1517 	if (physmem < btoc(2 * 1024 * 1024)) {
1518 		printf("warning: too little memory available; "
1519 		       "have %lu bytes, want %lu bytes\n"
1520 		       "running in degraded mode\n"
1521 		       "press a key to confirm\n\n",
1522 		       (unsigned long)ptoa(physmem), 2*1024*1024UL);
1523 		cngetc();
1524 	}
1525 
1526 	pcb->pcb_dbregs = NULL;
1527 	x86_dbregs_init();
1528 }
1529 
1530 #include <dev/ic/mc146818reg.h>		/* for NVRAM POST */
1531 #include <i386/isa/nvram.h>		/* for NVRAM POST */
1532 
1533 void
1534 cpu_reset(void)
1535 {
1536 #ifdef XENPV
1537 	HYPERVISOR_reboot();
1538 	for (;;);
1539 #else /* XENPV */
1540 	struct region_descriptor region;
1541 	idt_descriptor_t *idt;
1542 
1543 	idt = (idt_descriptor_t *)cpu_info_primary.ci_idtvec.iv_idt;
1544 	x86_disable_intr();
1545 
1546 	/*
1547 	 * Ensure the NVRAM reset byte contains something vaguely sane.
1548 	 */
1549 
1550 	outb(IO_RTC, NVRAM_RESET);
1551 	outb(IO_RTC+1, NVRAM_RESET_RST);
1552 
1553 	/*
1554 	 * Reset AMD Geode SC1100.
1555 	 *
1556 	 * 1) Write PCI Configuration Address Register (0xcf8) to
1557 	 *    select Function 0, Register 0x44: Bridge Configuration,
1558 	 *    GPIO and LPC Configuration Register Space, Reset
1559 	 *    Control Register.
1560 	 *
1561 	 * 2) Write 0xf to PCI Configuration Data Register (0xcfc)
1562 	 *    to reset IDE controller, IDE bus, and PCI bus, and
1563 	 *    to trigger a system-wide reset.
1564 	 *
1565 	 * See AMD Geode SC1100 Processor Data Book, Revision 2.0,
1566 	 * sections 6.3.1, 6.3.2, and 6.4.1.
1567 	 */
1568 	if (cpu_info_primary.ci_signature == 0x540) {
1569 		outl(0xcf8, 0x80009044);
1570 		outl(0xcfc, 0xf);
1571 	}
1572 
1573 	x86_reset();
1574 
1575 	/*
1576 	 * Try to cause a triple fault and watchdog reset by making the IDT
1577 	 * invalid and causing a fault.
1578 	 */
1579 	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
1580 	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1581 	lidt(&region);
1582 	breakpoint();
1583 
1584 #if 0
1585 	/*
1586 	 * Try to cause a triple fault and watchdog reset by unmapping the
1587 	 * entire address space and doing a TLB flush.
1588 	 */
1589 	memset((void *)PTD, 0, PAGE_SIZE);
1590 	tlbflush();
1591 #endif
1592 
1593 	for (;;);
1594 #endif /* XENPV */
1595 }
1596 
1597 void
1598 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
1599 {
1600 	const struct trapframe *tf = l->l_md.md_regs;
1601 	__greg_t *gr = mcp->__gregs;
1602 	__greg_t ras_eip;
1603 
1604 	/* Save register context. */
1605 	gr[_REG_GS]  = tf->tf_gs;
1606 	gr[_REG_FS]  = tf->tf_fs;
1607 	gr[_REG_ES]  = tf->tf_es;
1608 	gr[_REG_DS]  = tf->tf_ds;
1609 	gr[_REG_EFL] = tf->tf_eflags;
1610 
1611 	gr[_REG_EDI]    = tf->tf_edi;
1612 	gr[_REG_ESI]    = tf->tf_esi;
1613 	gr[_REG_EBP]    = tf->tf_ebp;
1614 	gr[_REG_EBX]    = tf->tf_ebx;
1615 	gr[_REG_EDX]    = tf->tf_edx;
1616 	gr[_REG_ECX]    = tf->tf_ecx;
1617 	gr[_REG_EAX]    = tf->tf_eax;
1618 	gr[_REG_EIP]    = tf->tf_eip;
1619 	gr[_REG_CS]     = tf->tf_cs;
1620 	gr[_REG_ESP]    = tf->tf_esp;
1621 	gr[_REG_UESP]   = tf->tf_esp;
1622 	gr[_REG_SS]     = tf->tf_ss;
1623 	gr[_REG_TRAPNO] = tf->tf_trapno;
1624 	gr[_REG_ERR]    = tf->tf_err;
1625 
1626 	if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
1627 	    (void *) gr[_REG_EIP])) != -1)
1628 		gr[_REG_EIP] = ras_eip;
1629 
1630 	*flags |= _UC_CPU;
1631 
1632 	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
1633 	*flags |= _UC_TLSBASE;
1634 
1635 	/*
1636 	 * Save floating point register context.
1637 	 *
1638 	 * If the cpu doesn't support fxsave we must still write to
1639 	 * the entire 512 byte area - otherwise we leak kernel memory
1640 	 * contents to userspace.
1641 	 * Since the whole area has to be written anyway, we might as
1642 	 * well convert to fxsave format.
1643 	 */
1644 	__CTASSERT(sizeof (struct fxsave) ==
1645 	    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1646 	process_read_fpregs_xmm(l, (struct fxsave *)
1647 	    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1648 	memset(&mcp->__fpregs.__fp_pad, 0, sizeof mcp->__fpregs.__fp_pad);
1649 	*flags |= _UC_FXSAVE | _UC_FPU;
1650 }
1651 
1652 int
1653 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
1654 {
1655 	const __greg_t *gr = mcp->__gregs;
1656 	struct trapframe *tf = l->l_md.md_regs;
1657 
1658 	/*
1659 	 * Check for security violations.  If we're returning
1660 	 * to protected mode, the CPU will validate the segment
1661 	 * registers automatically and generate a trap on
1662 	 * violations.  We handle the trap, rather than doing
1663 	 * all of the checking here.
1664 	 */
1665 	if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
1666 	    !USERMODE(gr[_REG_CS]))
1667 		return EINVAL;
1668 
1669 	return 0;
1670 }
1671 
1672 int
1673 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
1674 {
1675 	struct trapframe *tf = l->l_md.md_regs;
1676 	const __greg_t *gr = mcp->__gregs;
1677 	struct proc *p = l->l_proc;
1678 	int error;
1679 
1680 	/* Restore register context, if any. */
1681 	if ((flags & _UC_CPU) != 0) {
1682 		error = cpu_mcontext_validate(l, mcp);
1683 		if (error)
1684 			return error;
1685 
1686 		tf->tf_gs = gr[_REG_GS];
1687 		tf->tf_fs = gr[_REG_FS];
1688 		tf->tf_es = gr[_REG_ES];
1689 		tf->tf_ds = gr[_REG_DS];
1690 		/* Only change the user-alterable part of eflags */
1691 		tf->tf_eflags &= ~PSL_USER;
1692 		tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
1693 
1694 		tf->tf_edi    = gr[_REG_EDI];
1695 		tf->tf_esi    = gr[_REG_ESI];
1696 		tf->tf_ebp    = gr[_REG_EBP];
1697 		tf->tf_ebx    = gr[_REG_EBX];
1698 		tf->tf_edx    = gr[_REG_EDX];
1699 		tf->tf_ecx    = gr[_REG_ECX];
1700 		tf->tf_eax    = gr[_REG_EAX];
1701 		tf->tf_eip    = gr[_REG_EIP];
1702 		tf->tf_cs     = gr[_REG_CS];
1703 		tf->tf_esp    = gr[_REG_UESP];
1704 		tf->tf_ss     = gr[_REG_SS];
1705 	}
1706 
1707 	if ((flags & _UC_TLSBASE) != 0)
1708 		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
1709 
1710 	/* Restore floating point register context, if given. */
1711 	if ((flags & _UC_FPU) != 0) {
1712 		__CTASSERT(sizeof (struct fxsave) ==
1713 		    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1714 		__CTASSERT(sizeof (struct save87) ==
1715 		    sizeof mcp->__fpregs.__fp_reg_set.__fpchip_state);
1716 
1717 		if (flags & _UC_FXSAVE) {
1718 			process_write_fpregs_xmm(l, (const struct fxsave *)
1719 				    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1720 		} else {
1721 			process_write_fpregs_s87(l, (const struct save87 *)
1722 				    &mcp->__fpregs.__fp_reg_set.__fpchip_state);
1723 		}
1724 	}
1725 
1726 	mutex_enter(p->p_lock);
1727 	if (flags & _UC_SETSTACK)
1728 		l->l_sigstk.ss_flags |= SS_ONSTACK;
1729 	if (flags & _UC_CLRSTACK)
1730 		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
1731 	mutex_exit(p->p_lock);
1732 	return (0);
1733 }
1734 
1735 #define	DEV_IO 14		/* iopl for compat_10 */
1736 
1737 int
1738 mm_md_open(dev_t dev, int flag, int mode, struct lwp *l)
1739 {
1740 
1741 	switch (minor(dev)) {
1742 	case DEV_IO:
1743 		/*
1744 		 * This is done by i386_iopl(3) now.
1745 		 *
1746 		 * #if defined(COMPAT_10) || defined(COMPAT_FREEBSD)
1747 		 */
1748 		if (flag & FWRITE) {
1749 			struct trapframe *fp;
1750 			int error;
1751 
1752 			error = kauth_authorize_machdep(l->l_cred,
1753 			    KAUTH_MACHDEP_IOPL, NULL, NULL, NULL, NULL);
1754 			if (error)
1755 				return (error);
1756 			fp = curlwp->l_md.md_regs;
1757 			fp->tf_eflags |= PSL_IOPL;
1758 		}
1759 		break;
1760 	default:
1761 		break;
1762 	}
1763 	return 0;
1764 }
1765 
1766 #ifdef PAE
1767 void
1768 cpu_alloc_l3_page(struct cpu_info *ci)
1769 {
1770 	int ret;
1771 	struct pglist pg;
1772 	struct vm_page *vmap;
1773 
1774 	KASSERT(ci != NULL);
1775 	/*
1776 	 * Allocate a page for the per-CPU L3 PD. %cr3 being 32 bits, the
1777 	 * PA must reside below the 4GB boundary.
1778 	 */
1779 	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1780 	vmap = TAILQ_FIRST(&pg);
1781 
1782 	if (ret != 0 || vmap == NULL)
1783 		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1784 			__func__, cpu_index(ci), ret);
1785 
1786 	ci->ci_pae_l3_pdirpa = VM_PAGE_TO_PHYS(vmap);
1787 
1788 	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1789 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1790 	if (ci->ci_pae_l3_pdir == NULL)
1791 		panic("%s: failed to allocate L3 PD for CPU %d\n",
1792 			__func__, cpu_index(ci));
1793 
1794 	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1795 		VM_PROT_READ | VM_PROT_WRITE, 0);
1796 
1797 	pmap_update(pmap_kernel());
1798 }
1799 #endif /* PAE */
1800 
1801 static void
1802 idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
1803 {
1804 	idt_descriptor_t *idt_dst;
1805 
1806 	idt_dst = dst->iv_idt;
1807 	memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
1808 	memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
1809 }
1810 
1811 void
1812 idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
1813 {
1814 	vaddr_t va_idt, va_pentium_idt;
1815 	struct vm_page *pg;
1816 
1817 	if (idt_vec_is_pcpu() &&
1818 	    cid != cpu_index(&cpu_info_primary)) {
1819 		va_idt = uvm_km_alloc(kernel_map, PAGE_SIZE,
1820 		    0, UVM_KMF_VAONLY);
1821 		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
1822 		if (pg == NULL) {
1823 			panic("failed to allocate pcpu idt PA");
1824 		}
1825 		pmap_kenter_pa(va_idt, VM_PAGE_TO_PHYS(pg),
1826 		    VM_PROT_READ|VM_PROT_WRITE, 0);
1827 		pmap_update(pmap_kernel());
1828 
1829 		memset((void *)va_idt, 0, PAGE_SIZE);
1830 
1831 		/* pentium f00f bug stuff */
1832 		va_pentium_idt = uvm_km_alloc(kernel_map, PAGE_SIZE,
1833 		    0, UVM_KMF_VAONLY);
1834 		pmap_kenter_pa(va_pentium_idt, VM_PAGE_TO_PHYS(pg),
1835 		    VM_PROT_READ, 0);
1836 		pmap_update(pmap_kernel());
1837 
1838 		iv->iv_idt = (void *)va_idt;
1839 		iv->iv_idt_pentium = (void *)va_pentium_idt;
1840 
1841 		idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
1842 	} else {
1843 		iv->iv_idt = (void *)idt_vaddr;
1844 		iv->iv_idt_pentium = (void *)pentium_idt_vaddr;
1845 	}
1846 }
1847