xref: /netbsd-src/sys/arch/amd64/amd64/machdep.c (revision 6807c6be5e8dec1164c0e4d531977fcad218ce85)
1 /*	$NetBSD: machdep.c,v 1.371 2025/01/22 10:03:55 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
5  *     The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * This code is derived from software contributed to The NetBSD Foundation
13  * by Coyote Point Systems, Inc. which was written under contract to Coyote
14  * Point by Jed Davis and Devon O'Dell.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
40  *
41  * Permission to use, copy, modify, and distribute this software for any
42  * purpose with or without fee is hereby granted, provided that the above
43  * copyright notice and this permission notice appear in all copies.
44  *
45  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
46  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
47  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
48  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
49  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
50  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
51  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
52  */
53 
54 /*
55  * Copyright (c) 2007 Manuel Bouyer.
56  *
57  * Redistribution and use in source and binary forms, with or without
58  * modification, are permitted provided that the following conditions
59  * are met:
60  * 1. Redistributions of source code must retain the above copyright
61  *    notice, this list of conditions and the following disclaimer.
62  * 2. Redistributions in binary form must reproduce the above copyright
63  *    notice, this list of conditions and the following disclaimer in the
64  *    documentation and/or other materials provided with the distribution.
65  *
66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
67  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
68  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
69  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
70  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
71  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
72  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
73  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
74  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
75  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /*
79  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
80  * All rights reserved.
81  *
82  * This code is derived from software contributed to Berkeley by
83  * William Jolitz.
84  *
85  * Redistribution and use in source and binary forms, with or without
86  * modification, are permitted provided that the following conditions
87  * are met:
88  * 1. Redistributions of source code must retain the above copyright
89  *    notice, this list of conditions and the following disclaimer.
90  * 2. Redistributions in binary form must reproduce the above copyright
91  *    notice, this list of conditions and the following disclaimer in the
92  *    documentation and/or other materials provided with the distribution.
93  * 3. Neither the name of the University nor the names of its contributors
94  *    may be used to endorse or promote products derived from this software
95  *    without specific prior written permission.
96  *
97  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
98  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
99  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
100  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
101  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
102  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
103  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
104  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
105  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
106  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
107  * SUCH DAMAGE.
108  *
109  *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
110  */
111 
112 #include <sys/cdefs.h>
113 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.371 2025/01/22 10:03:55 riastradh Exp $");
114 
115 #include "opt_modular.h"
116 #include "opt_user_ldt.h"
117 #include "opt_ddb.h"
118 #include "opt_kgdb.h"
119 #include "opt_cpureset_delay.h"
120 #include "opt_mtrr.h"
121 #include "opt_realmem.h"
122 #include "opt_xen.h"
123 #include "opt_svs.h"
124 #include "opt_kaslr.h"
125 #ifndef XENPV
126 #include "opt_physmem.h"
127 #endif
128 #include "isa.h"
129 #include "pci.h"
130 
131 #include <sys/param.h>
132 #include <sys/systm.h>
133 #include <sys/signal.h>
134 #include <sys/signalvar.h>
135 #include <sys/kernel.h>
136 #include <sys/cpu.h>
137 #include <sys/exec.h>
138 #include <sys/exec_aout.h>	/* for MID_* */
139 #include <sys/reboot.h>
140 #include <sys/conf.h>
141 #include <sys/msgbuf.h>
142 #include <sys/mount.h>
143 #include <sys/core.h>
144 #include <sys/kcore.h>
145 #include <sys/ucontext.h>
146 #include <machine/kcore.h>
147 #include <sys/ras.h>
148 #include <sys/syscallargs.h>
149 #include <sys/ksyms.h>
150 #include <sys/device.h>
151 #include <sys/lwp.h>
152 #include <sys/proc.h>
153 #include <sys/asan.h>
154 #include <sys/csan.h>
155 #include <sys/msan.h>
156 #include <sys/module.h>
157 #include <sys/timevar.h>
158 
159 #ifdef KGDB
160 #include <sys/kgdb.h>
161 #endif
162 
163 #include <lib/libkern/entpool.h> /* XXX */
164 
165 #include <dev/cons.h>
166 #include <dev/mm.h>
167 
168 #include <uvm/uvm.h>
169 #include <uvm/uvm_page.h>
170 
171 #include <sys/sysctl.h>
172 
173 #include <machine/cpu.h>
174 #include <machine/cpu_rng.h>
175 #include <machine/cpufunc.h>
176 #include <machine/gdt.h>
177 #include <machine/intr.h>
178 #include <machine/pio.h>
179 #include <machine/psl.h>
180 #include <machine/reg.h>
181 #include <machine/specialreg.h>
182 #include <machine/bootinfo.h>
183 #include <x86/fpu.h>
184 #include <x86/dbregs.h>
185 #include <machine/mtrr.h>
186 #include <machine/mpbiosvar.h>
187 #include <machine/pmap_private.h>
188 
189 #include <x86/bootspace.h>
190 #include <x86/cputypes.h>
191 #include <x86/cpuvar.h>
192 #include <x86/machdep.h>
193 #include <x86/x86/tsc.h>
194 
195 #include <dev/isa/isareg.h>
196 #include <machine/isa_machdep.h>
197 #include <dev/ic/i8042reg.h>
198 
199 #ifdef XEN
200 #include <xen/xen.h>
201 #include <xen/hypervisor.h>
202 #include <xen/evtchn.h>
203 #include <xen/include/public/version.h>
204 #include <xen/include/public/vcpu.h>
205 #endif /* XEN */
206 
207 #include <ddb/db_active.h>
208 
209 #ifdef DDB
210 #include <machine/db_machdep.h>
211 #include <ddb/db_extern.h>
212 #include <ddb/db_output.h>
213 #include <ddb/db_interface.h>
214 #endif
215 
216 #include "acpica.h"
217 
218 #if NACPICA > 0
219 #include <dev/acpi/acpivar.h>
220 #define ACPI_MACHDEP_PRIVATE
221 #include <machine/acpi_machdep.h>
222 #else
223 #include <machine/i82489var.h>
224 #endif
225 
226 #include "isa.h"
227 #include "isadma.h"
228 #include "ksyms.h"
229 
230 /* the following is used externally (sysctl_hw) */
231 char machine[] = "amd64";		/* CPU "architecture" */
232 char machine_arch[] = "x86_64";		/* MACHINE_ARCH; note machine != machine_arch here */
233 
234 #ifdef CPURESET_DELAY
235 int cpureset_delay = CPURESET_DELAY;
236 #else
237 int cpureset_delay = 2000; /* default to 2s */
238 #endif
239 
240 int cpu_class = CPUCLASS_686;
241 
242 #ifdef MTRR
243 const struct mtrr_funcs *mtrr_funcs;
244 #endif
245 
246 int cpu_class;
247 int use_pae;
248 
249 #ifndef NO_SPARSE_DUMP
250 int sparse_dump = 1;
251 
252 paddr_t max_paddr = 0;
253 unsigned char *sparse_dump_physmap;
254 #endif
255 
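/*
 * One-page staging buffer used to assemble the (possibly large) dump header
 * incrementally; see dump_header_start()/dump_header_addbytes() below.
 */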
256 char *dump_headerbuf, *dump_headerbuf_ptr;
257 #define dump_headerbuf_size PAGE_SIZE
258 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
259 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
260 daddr_t dump_header_blkno;
261 
262 size_t dump_nmemsegs;
263 size_t dump_npages;
264 size_t dump_header_size;
265 size_t dump_totalbytesleft;
266 
267 vaddr_t idt_vaddr;
268 paddr_t idt_paddr;
269 vaddr_t gdt_vaddr;
270 paddr_t gdt_paddr;
271 vaddr_t ldt_vaddr;
272 paddr_t ldt_paddr;
273 
274 static struct vm_map module_map_store;
275 extern struct bootspace bootspace;
276 extern struct slotspace slotspace;
277 
278 vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT;
279 vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT;
280 pd_entry_t *pte_base __read_mostly;
281 
282 struct vm_map *phys_map = NULL;
283 
284 extern paddr_t lowmem_rsvd;
285 extern paddr_t avail_start, avail_end;
286 #ifdef XENPV
287 extern paddr_t pmap_pa_start, pmap_pa_end;
288 #endif
289 
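/*
 * Scratch area kept at the top of the NMI stack (set up in cpu_init_tss()
 * below).  It records the kernel %cr3 so that the NMI entry code can locate
 * the kernel page tables no matter what was active when the NMI fired.
 */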
290 struct nmistore {
291 	uint64_t cr3;
292 	uint64_t scratch;
293 } __packed;
294 
295 /*
296  * Size of memory segments, before any memory is stolen.
297  */
298 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
299 int mem_cluster_cnt;
300 
301 int cpu_dump(void);
302 int cpu_dumpsize(void);
303 u_long cpu_dump_mempagecnt(void);
304 void dodumpsys(void);
305 void dumpsys(void);
306 
307 static void x86_64_proc0_pcb_ldt_init(void);
308 
309 void dump_misc_init(void);
310 void dump_seg_prep(void);
311 int dump_seg_iter(int (*)(paddr_t, paddr_t));
312 
313 #ifndef NO_SPARSE_DUMP
314 void sparse_dump_reset(void);
315 void sparse_dump_mark(void);
316 void cpu_dump_prep_sparse(void);
317 #endif
318 
319 void dump_header_start(void);
320 int dump_header_flush(void);
321 int dump_header_addbytes(const void*, size_t);
322 int dump_header_addseg(paddr_t, paddr_t);
323 int dump_header_finish(void);
324 
325 int dump_seg_count_range(paddr_t, paddr_t);
326 int dumpsys_seg(paddr_t, paddr_t);
327 
328 void init_bootspace(void);
329 void init_slotspace(void);
330 void init_x86_64(paddr_t);
331 
332 /*
333  * Machine-dependent startup code
334  */
335 void
336 cpu_startup(void)
337 {
338 	int x, y;
339 	vaddr_t minaddr, maxaddr;
340 	psize_t sz;
341 
342 	/*
343 	 * For console drivers that require uvm and pmap to be initialized,
344 	 * we'll give them one more chance here...
345 	 */
346 	consinit();
347 
348 	/*
349 	 * Initialize error message buffer (at end of core).
350 	 */
351 	if (msgbuf_p_cnt == 0)
352 		panic("msgbuf paddr map has not been set up");
353 	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
354 		continue;
355 
356 	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
357 	if (msgbuf_vaddr == 0)
358 		panic("failed to valloc msgbuf_vaddr");
359 
360 	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
361 		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
362 			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
363 			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
364 			    VM_PROT_READ|VM_PROT_WRITE, 0);
365 	}
366 
367 	pmap_update(pmap_kernel());
368 
369 	initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
370 
371 	minaddr = 0;
372 
373 	/*
374 	 * Allocate a submap for physio.
375 	 */
376 	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
377 	    VM_PHYS_SIZE, 0, false, NULL);
378 
379 	/*
380 	 * Create the module map.
381 	 *
382 	 * The kernel uses RIP-relative addressing with a maximum offset of
383 	 * 2GB. Because of that, we can't put the kernel modules in kernel_map
384 	 * (like i386 does), since kernel_map is too far away in memory from
385 	 * the kernel sections. So we have to create a special module_map.
386 	 *
387 	 * The module map is taken as what is left of the bootstrap memory
388 	 * created in locore/prekern.
389 	 */
390 	uvm_map_setup(&module_map_store, bootspace.smodule,
391 	    bootspace.emodule, 0);
392 	module_map_store.pmap = pmap_kernel();
393 	module_map = &module_map_store;
394 
395 	/* Say hello. */
396 	banner();
397 
398 #if NISA > 0 || NPCI > 0
399 	/* Safe for i/o port / memory space allocation to use malloc now. */
400 	x86_bus_space_mallocok();
401 #endif
402 
403 #ifdef __HAVE_PCPU_AREA
404 	cpu_pcpuarea_init(&cpu_info_primary);
405 #endif
406 	gdt_init();
407 	x86_64_proc0_pcb_ldt_init();
408 
409 	cpu_init_tss(&cpu_info_primary);
410 #if !defined(XENPV)
411 	ltr(cpu_info_primary.ci_tss_sel);
412 #endif
413 
414 	x86_startup();
415 }
416 
417 #ifdef XENPV
418 /* used in assembly */
419 void hypervisor_callback(void);
420 void failsafe_callback(void);
421 void x86_64_switch_context(struct pcb *);
422 void x86_64_tls_switch(struct lwp *);
423 
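/*
 * XENPV context-switch hook: tell the hypervisor the new lwp's kernel stack
 * pointer (used on user->kernel transitions) and its I/O privilege level.
 */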
424 void
425 x86_64_switch_context(struct pcb *new)
426 {
427 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
428 	struct physdev_set_iopl set_iopl;
429 	set_iopl.iopl = new->pcb_iopl;
430 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
431 }
432 
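/*
 * XENPV: load lwp l's user %fs/%gs state, via the 32-bit GDT descriptors
 * for COMPAT32 lwps and via the 64-bit segment-base hypercalls otherwise.
 */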
433 void
434 x86_64_tls_switch(struct lwp *l)
435 {
436 	struct cpu_info *ci = curcpu();
437 	struct pcb *pcb = lwp_getpcb(l);
438 	struct trapframe *tf = l->l_md.md_regs;
439 	uint64_t zero = 0;
440 
441 	/*
442 	 * Raise the IPL to IPL_HIGH. XXX Still needed?
443 	 */
444 	(void)splhigh();
445 
446 	/* Update segment registers */
447 	if (pcb->pcb_flags & PCB_COMPAT32) {
448 		update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
449 		update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
450 		setds(GSEL(GUDATA32_SEL, SEL_UPL));
451 		setes(GSEL(GUDATA32_SEL, SEL_UPL));
452 		setfs(GSEL(GUDATA32_SEL, SEL_UPL));
453 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
454 	} else {
455 		update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero);
456 		update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero);
457 		setds(GSEL(GUDATA_SEL, SEL_UPL));
458 		setes(GSEL(GUDATA_SEL, SEL_UPL));
459 		setfs(0);
460 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
461 		HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
462 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
463 	}
464 }
465 #endif /* XENPV */
466 
467 /*
468  * Set up proc0's PCB and LDT.
469  */
470 static void
471 x86_64_proc0_pcb_ldt_init(void)
472 {
473 	struct lwp *l = &lwp0;
474 	struct pcb *pcb = lwp_getpcb(l);
475 
476 	pcb->pcb_flags = 0;
477 	pcb->pcb_fs = 0;
478 	pcb->pcb_gs = 0;
479 	pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf;
480 	pcb->pcb_iopl = IOPL_KPL;
481 	pcb->pcb_dbregs = NULL;
482 	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
483 	l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
484 
485 #if !defined(XENPV)
486 	lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
487 #else
488 	xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3);
489 	/* Reset TS bit and set kernel stack for interrupt handlers */
490 	HYPERVISOR_fpu_taskswitch(1);
491 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
492 	struct physdev_set_iopl set_iopl;
493 	set_iopl.iopl = pcb->pcb_iopl;
494 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
495 #endif
496 }
497 
498 /*
499  * Set up TSS and I/O bitmap.
500  */
501 void
502 cpu_init_tss(struct cpu_info *ci)
503 {
504 #ifdef __HAVE_PCPU_AREA
505 	const cpuid_t cid = cpu_index(ci);
506 #endif
507 	struct cpu_tss *cputss;
508 	struct nmistore *store;
509 	uintptr_t p;
510 
511 #ifdef __HAVE_PCPU_AREA
512 	cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss;
513 #else
514 	cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
515 	    sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
516 #endif
517 
518 	cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
519 
520 	/* DDB stack */
521 #ifdef __HAVE_PCPU_AREA
522 	p = (vaddr_t)&pcpuarea->ent[cid].ist0;
523 #else
524 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
525 #endif
526 	cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16;
527 
528 	/* double fault */
529 #ifdef __HAVE_PCPU_AREA
530 	p = (vaddr_t)&pcpuarea->ent[cid].ist1;
531 #else
532 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
533 #endif
534 	cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16;
535 
536 	/* NMI - store a structure at the top of the stack */
537 #ifdef __HAVE_PCPU_AREA
538 	p = (vaddr_t)&pcpuarea->ent[cid].ist2;
539 #else
540 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
541 #endif
542 	cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore);
543 	store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore));
544 	store->cr3 = pmap_pdirpa(pmap_kernel(), 0);
545 
546 	/* DB */
547 #ifdef __HAVE_PCPU_AREA
548 	p = (vaddr_t)&pcpuarea->ent[cid].ist3;
549 #else
550 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
551 #endif
552 	cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16;
553 
554 	ci->ci_tss = cputss;
555 	ci->ci_tss_sel = tss_alloc(&cputss->tss);
556 }
557 
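/*
 * Redirect the user trapframe so that, on return to userland, the lwp starts
 * executing the signal handler 'catcher' with its stack pointer at 'f' and
 * with sane segment registers and FPU state.
 */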
558 void
559 buildcontext(struct lwp *l, void *catcher, void *f)
560 {
561 	struct trapframe *tf = l->l_md.md_regs;
562 
563 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
564 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
565 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
566 	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
567 
568 	tf->tf_rip = (uint64_t)catcher;
569 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
570 	tf->tf_rflags &= ~PSL_CLEARSIG;
571 	tf->tf_rsp = (uint64_t)f;
572 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
573 
574 	/* Ensure FP state is sane */
575 	fpu_sigreset(l);
576 }
577 
578 void
579 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
580 {
581 
582 	printf("sendsig_sigcontext: illegal\n");
583 	sigexit(curlwp, SIGILL);
584 }
585 
586 void
587 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
588 {
589 	struct lwp *l = curlwp;
590 	struct proc *p = l->l_proc;
591 	struct sigacts *ps = p->p_sigacts;
592 	int onstack, error;
593 	int sig = ksi->ksi_signo;
594 	struct sigframe_siginfo *fp, frame;
595 	sig_t catcher = SIGACTION(p, sig).sa_handler;
596 	struct trapframe *tf = l->l_md.md_regs;
597 	char *sp;
598 
599 	KASSERT(mutex_owned(p->p_lock));
600 
601 	/* Do we need to jump onto the signal stack? */
602 	onstack =
603 	    (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
604 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
605 
606 	/* Allocate space for the signal handler context. */
607 	if (onstack)
608 		sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
609 	else
610 		/* AMD64 ABI 128-bytes "red zone". */
611 		sp = (char *)tf->tf_rsp - 128;
612 
613 	sp -= sizeof(struct sigframe_siginfo);
614 	/* Round down the stackpointer to a multiple of 16 for the ABI. */
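	/*
	 * The additional -8 accounts for sf_ra in the return-address slot:
	 * at handler entry %rsp is then congruent to 8 mod 16, exactly as if
	 * the handler had just been call'ed, which is what the ABI mandates.
	 */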
615 	fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8);
616 
617 	memset(&frame, 0, sizeof(frame));
618 	frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
619 	frame.sf_si._info = ksi->ksi_info;
620 	frame.sf_uc.uc_flags = _UC_SIGMASK;
621 	frame.sf_uc.uc_sigmask = *mask;
622 	frame.sf_uc.uc_link = l->l_ctxlink;
623 	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
624 	    ? _UC_SETSTACK : _UC_CLRSTACK;
625 	sendsig_reset(l, sig);
626 
627 	mutex_exit(p->p_lock);
628 	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
629 	/* Copy out all the FP regs; the signal handler might expect them. */
630 	error = copyout(&frame, fp, sizeof frame);
631 	mutex_enter(p->p_lock);
632 
633 	if (error != 0) {
634 		/*
635 		 * Process has trashed its stack; give it an illegal
636 		 * instruction to halt it in its tracks.
637 		 */
638 		sigexit(l, SIGILL);
639 		/* NOTREACHED */
640 	}
641 
642 	buildcontext(l, catcher, fp);
643 
644 	tf->tf_rdi = sig;
645 	tf->tf_rsi = (uint64_t)&fp->sf_si;
646 	tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
647 
648 	/* Remember that we're now on the signal stack. */
649 	if (onstack)
650 		l->l_sigstk.ss_flags |= SS_ONSTACK;
651 
652 	if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
653 		/*
654 		 * The process has given an invalid address for the handler.
655 		 * Stop it, but not any earlier, so that the right info can
656 		 * still be returned to userland (or end up in the core dump).
657 		 */
658 		sigexit(l, SIGILL);
659 		/* NOTREACHED */
660 	}
661 }
662 
663 struct pcb dumppcb;
664 
665 void
666 cpu_reboot(int howto, char *bootstr)
667 {
668 	static bool syncdone = false;
669 	int s = IPL_NONE;
670 	__USE(s);	/* ugly otherwise */
671 
672 	if (cold) {
673 		howto |= RB_HALT;
674 		goto haltsys;
675 	}
676 
677 	boothowto = howto;
678 
679 	/* i386 maybe_dump() */
680 
681 	/*
682 	 * If we've panic'd, don't make the situation potentially
683 	 * worse by syncing or unmounting the file systems.
684 	 */
685 	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
686 		if (!syncdone) {
687 			syncdone = true;
688 			/* XXX used to force unmount as well, here */
689 			vfs_sync_all(curlwp);
690 		}
691 
692 		while (vfs_unmountall1(curlwp, false, false) ||
693 		       config_detach_all(boothowto) ||
694 		       vfs_unmount_forceone(curlwp))
695 			;	/* do nothing */
696 	} else {
697 		if (!db_active)
698 			suspendsched();
699 	}
700 
701 	pmf_system_shutdown(boothowto);
702 
703 	/* Disable interrupts. */
704 	s = splhigh();
705 
706 	/* Do a dump if requested. */
707 	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
708 		dumpsys();
709 
710 haltsys:
711 	doshutdownhooks();
712 
713         if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
714 #if NACPICA > 0
715 		if (s != IPL_NONE)
716 			splx(s);
717 
718 		acpi_enter_sleep_state(ACPI_STATE_S5);
719 #endif
720 #ifdef XEN
721 		if (vm_guest == VM_GUEST_XENPV ||
722 		    vm_guest == VM_GUEST_XENPVH ||
723 		    vm_guest == VM_GUEST_XENPVHVM)
724 			HYPERVISOR_shutdown();
725 #endif /* XEN */
726 	}
727 
728 	cpu_broadcast_halt();
729 
730 	if (howto & RB_HALT) {
731 #if NACPICA > 0
732 		acpi_disable();
733 #endif
734 
735 		printf("\n");
736 		printf("The operating system has halted.\n");
737 		printf("Please press any key to reboot.\n\n");
738 		cnpollc(1);	/* for proper keyboard command handling */
739 		if (cngetc() == 0) {
740 			/* no console attached, so just hlt */
741 			printf("No keyboard - cannot reboot after all.\n");
742 			for(;;) {
743 				x86_hlt();
744 			}
745 		}
746 		cnpollc(0);
747 	}
748 
749 	printf("rebooting...\n");
750 	if (cpureset_delay > 0)
751 		delay(cpureset_delay * 1000);
752 	cpu_reset();
753 	for(;;) ;
754 	/*NOTREACHED*/
755 }
756 
757 /*
758  * XXXfvdl share dumpcode.
759  */
760 
761 /*
762  * Perform assorted dump-related initialization tasks.  Assumes that
763  * the maximum physical memory address will not increase afterwards.
764  */
765 void
766 dump_misc_init(void)
767 {
768 #ifndef NO_SPARSE_DUMP
769 	int i;
770 #endif
771 
772 	if (dump_headerbuf != NULL)
773 		return; /* already called */
774 
775 #ifndef NO_SPARSE_DUMP
776 	for (i = 0; i < mem_cluster_cnt; ++i) {
777 		paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
778 		if (max_paddr < top)
779 			max_paddr = top;
780 	}
781 #ifdef DEBUG
782 	printf("dump_misc_init: max_paddr = 0x%lx\n",
783 	    (unsigned long)max_paddr);
784 #endif
785 	if (max_paddr == 0) {
786 		printf("Your machine does not initialize mem_clusters; "
787 		    "sparse_dumps disabled\n");
788 		sparse_dump = 0;
789 	} else {
790 		sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
791 		    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
792 		    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
793 	}
794 #endif
795 	dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
796 	    dump_headerbuf_size,
797 	    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
798 	/* XXXjld should check for failure here, disable dumps if so. */
799 }
800 
801 #ifndef NO_SPARSE_DUMP
802 /*
803  * Clear the set of pages to include in a sparse dump.
804  */
805 void
806 sparse_dump_reset(void)
807 {
808 	memset(sparse_dump_physmap, 0,
809 	    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
810 }
811 
812 /*
813  * Include or exclude pages in a sparse dump.
814  */
815 void
816 sparse_dump_mark(void)
817 {
818 	paddr_t p, pstart, pend;
819 	struct vm_page *pg;
820 	int i;
821 	uvm_physseg_t upm;
822 
823 	/*
824 	 * Mark all memory pages, then unmark pages that are uninteresting.
825 	 * Dereferencing pg->uobject might crash again if another CPU
826 	 * frees the object out from under us, but we can't lock anything
827 	 * so it's a risk we have to take.
828 	 */
829 
830 	for (i = 0; i < mem_cluster_cnt; ++i) {
831 		pstart = mem_clusters[i].start / PAGE_SIZE;
832 		pend = pstart + mem_clusters[i].size / PAGE_SIZE;
833 
834 		for (p = pstart; p < pend; p++) {
835 			setbit(sparse_dump_physmap, p);
836 		}
837 	}
838         for (upm = uvm_physseg_get_first();
839 	     uvm_physseg_valid_p(upm);
840 	     upm = uvm_physseg_get_next(upm)) {
841 		paddr_t pfn;
842 
843 		/*
844 		 * We assume that seg->start to seg->end are
845 		 * uvm_page_physload()ed
846 		 */
847 		for (pfn = uvm_physseg_get_start(upm);
848 		     pfn < uvm_physseg_get_end(upm);
849 		     pfn++) {
850 			pg = PHYS_TO_VM_PAGE(ptoa(pfn));
851 
852 			if (pg->uanon || (pg->flags & PG_FREE) ||
853 			    (pg->uobject && pg->uobject->pgops)) {
854 				p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE;
855 				clrbit(sparse_dump_physmap, p);
856 			}
857 		}
858 	}
859 }
860 
861 /*
862  * Machine-dependently decides on the contents of a sparse dump, using
863  * the above.
864  */
865 void
866 cpu_dump_prep_sparse(void)
867 {
868 	sparse_dump_reset();
869 	/* XXX could the alternate recursive page table be skipped? */
870 	sparse_dump_mark();
871 	/* Memory for I/O buffers could be unmarked here, for example. */
872 	/* The kernel text could also be unmarked, but gdb would be upset. */
873 }
874 #endif
875 
876 /*
877  * Abstractly iterate over the collection of memory segments to be
878  * dumped; the callback lacks the customary environment-pointer
879  * argument because none of the current users really need one.
880  *
881  * To be used only after dump_seg_prep is called to set things up.
882  */
883 int
884 dump_seg_iter(int (*callback)(paddr_t, paddr_t))
885 {
886 	int error, i;
887 
888 #define CALLBACK(start,size) do {     \
889 	error = callback(start,size); \
890 	if (error)                    \
891 		return error;         \
892 } while(0)
893 
894 	for (i = 0; i < mem_cluster_cnt; ++i) {
895 #ifndef NO_SPARSE_DUMP
896 		/*
897 		 * The bitmap is scanned within each memory segment,
898 		 * rather than over its entire domain, in case any
899 		 * pages outside of the memory proper have been mapped
900 		 * into kva; they might be devices that wouldn't
901 		 * appreciate being arbitrarily read, and including
902 		 * them could also break the assumption that a sparse
903 		 * dump will always be smaller than a full one.
904 		 */
905 		if (sparse_dump && sparse_dump_physmap) {
906 			paddr_t p, sp_start, sp_end;
907 			int lastset;
908 
909 			sp_start = mem_clusters[i].start;
910 			sp_end = sp_start + mem_clusters[i].size;
911 			sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */
912 			lastset = 0;
913 			for (p = sp_start; p < sp_end; p += PAGE_SIZE) {
914 				int thisset = isset(sparse_dump_physmap,
915 				    p/PAGE_SIZE);
916 
917 				if (!lastset && thisset)
918 					sp_start = p;
919 				if (lastset && !thisset)
920 					CALLBACK(sp_start, p - sp_start);
921 				lastset = thisset;
922 			}
923 			if (lastset)
924 				CALLBACK(sp_start, p - sp_start);
925 		} else
926 #endif
927 			CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
928 	}
929 	return 0;
930 #undef CALLBACK
931 }
932 
933 /*
934  * Prepare for an impending core dump: decide what's being dumped and
935  * how much space it will take up.
936  */
937 void
938 dump_seg_prep(void)
939 {
940 #ifndef NO_SPARSE_DUMP
941 	if (sparse_dump && sparse_dump_physmap)
942 		cpu_dump_prep_sparse();
943 #endif
944 
945 	dump_nmemsegs = 0;
946 	dump_npages = 0;
947 	dump_seg_iter(dump_seg_count_range);
948 
949 	dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
950 	    ALIGN(sizeof(cpu_kcore_hdr_t)) +
951 	    ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
952 	dump_header_size = roundup(dump_header_size, dbtob(1));
953 
954 	/*
955 	 * savecore(8) will read this to decide how many pages to
956 	 * copy, and cpu_dumpconf has already used the pessimistic
957 	 * value to set dumplo, so it's time to tell the truth.
958 	 */
959 	dumpsize = dump_npages; /* XXX could these just be one variable? */
960 }
961 
962 int
963 dump_seg_count_range(paddr_t start, paddr_t size)
964 {
965 	++dump_nmemsegs;
966 	dump_npages += size / PAGE_SIZE;
967 	return 0;
968 }
969 
970 /*
971  * A sparse dump's header may be rather large, due to the number of
972  * "segments" emitted.  These routines manage a simple output buffer,
973  * so that the header can be written to disk incrementally.
974  */
975 void
976 dump_header_start(void)
977 {
978 	dump_headerbuf_ptr = dump_headerbuf;
979 	dump_header_blkno = dumplo;
980 }
981 
982 int
983 dump_header_flush(void)
984 {
985 	const struct bdevsw *bdev;
986 	size_t to_write;
987 	int error;
988 
989 	bdev = bdevsw_lookup(dumpdev);
990 	to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
991 	error = bdev->d_dump(dumpdev, dump_header_blkno,
992 	    dump_headerbuf, to_write);
993 	dump_header_blkno += btodb(to_write);
994 	dump_headerbuf_ptr = dump_headerbuf;
995 	return error;
996 }
997 
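/*
 * Append n bytes to the header staging buffer, flushing it to the dump
 * device each time it fills up.
 */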
998 int
999 dump_header_addbytes(const void* vptr, size_t n)
1000 {
1001 	const char* ptr = vptr;
1002 	int error;
1003 
1004 	while (n > dump_headerbuf_avail) {
1005 		memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
1006 		ptr += dump_headerbuf_avail;
1007 		n -= dump_headerbuf_avail;
1008 		dump_headerbuf_ptr = dump_headerbuf_end;
1009 		error = dump_header_flush();
1010 		if (error)
1011 			return error;
1012 	}
1013 	memcpy(dump_headerbuf_ptr, ptr, n);
1014 	dump_headerbuf_ptr += n;
1015 
1016 	return 0;
1017 }
1018 
1019 int
1020 dump_header_addseg(paddr_t start, paddr_t size)
1021 {
1022 	phys_ram_seg_t seg = { start, size };
1023 	int error;
1024 
1025 	error = dump_header_addbytes(&seg, sizeof(seg));
1026 	if (error) {
1027 		printf("[seg 0x%"PRIxPADDR" bytes 0x%"PRIxPSIZE" failed,"
1028 		    " error=%d] ", start, size, error);
1029 	}
1030 	return error;
1031 }
1032 
1033 int
1034 dump_header_finish(void)
1035 {
1036 	int error;
1037 
1038 	memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
1039 	error = dump_header_flush();
1040 	if (error)
1041 		printf("[finish failed, error=%d] ", error);
1042 	return error;
1043 }
1044 
1045 
1046 /*
1047  * These variables are needed by /sbin/savecore
1048  */
1049 uint32_t	dumpmag = 0x8fca0101;	/* magic number */
1050 int 	dumpsize = 0;		/* pages */
1051 long	dumplo = 0; 		/* blocks */
1052 
1053 /*
1054  * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
1055  * for a full (non-sparse) dump.
1056  */
1057 int
1058 cpu_dumpsize(void)
1059 {
1060 	int size;
1061 
1062 	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
1063 	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1064 	if (roundup(size, dbtob(1)) != dbtob(1))
1065 		return (-1);
1066 
1067 	return (1);
1068 }
1069 
1070 /*
1071  * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
1072  * for a full (non-sparse) dump.
1073  */
1074 u_long
1075 cpu_dump_mempagecnt(void)
1076 {
1077 	u_long i, n;
1078 
1079 	n = 0;
1080 	for (i = 0; i < mem_cluster_cnt; i++)
1081 		n += atop(mem_clusters[i].size);
1082 	return (n);
1083 }
1084 
1085 /*
1086  * cpu_dump: dump the machine-dependent kernel core dump headers.
1087  */
1088 int
1089 cpu_dump(void)
1090 {
1091 	kcore_seg_t seg;
1092 	cpu_kcore_hdr_t cpuhdr;
1093 	const struct bdevsw *bdev;
1094 	int error;
1095 
1096 	bdev = bdevsw_lookup(dumpdev);
1097 	if (bdev == NULL) {
1098 		printf("[device 0x%llx ENXIO] ", (unsigned long long)dumpdev);
1099 		return ENXIO;
1100 	}
1101 
1102 	/*
1103 	 * Generate a segment header.
1104 	 */
1105 	CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
1106 	seg.c_size = dump_header_size - ALIGN(sizeof(seg));
1107 	error = dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
1108 	if (error) {
1109 		printf("[segment header %zu bytes failed, error=%d] ",
1110 		    ALIGN(sizeof(seg)), error);
1111 		/* blithely proceed (can't fail?) */
1112 	}
1113 
1114 	/*
1115 	 * Add the machine-dependent header info.
1116 	 */
1117 	cpuhdr.ptdpaddr = PDPpaddr;
1118 	cpuhdr.nmemsegs = dump_nmemsegs;
1119 	error = dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
1120 	if (error) {
1121 		printf("[MD header %zu bytes failed, error=%d] ",
1122 		    ALIGN(sizeof(cpuhdr)), error);
1123 		/* blithely proceed (can't fail?) */
1124 	}
1125 
1126 	/*
1127 	 * Write out the memory segment descriptors.
1128 	 */
1129 	return dump_seg_iter(dump_header_addseg);
1130 }
1131 
1132 /*
1133  * Doadump comes here after turning off memory management and
1134  * getting on the dump stack, either when called above, or by
1135  * the auto-restart code.
1136  */
1137 #define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
1138 static vaddr_t dumpspace;
1139 
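/*
 * Record the BYTES_PER_DUMP-sized window of KVA through which dumpsys_seg()
 * maps each chunk of physical memory before writing it out.
 */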
1140 vaddr_t
1141 reserve_dumppages(vaddr_t p)
1142 {
1143 
1144 	dumpspace = p;
1145 	return (p + BYTES_PER_DUMP);
1146 }
1147 
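/*
 * Write the physical range [maddr, maddr + bytes) to the dump device, one
 * BYTES_PER_DUMP chunk at a time, mapping each chunk at dumpspace with
 * pmap_kenter_pa() before handing it to the driver's d_dump routine.
 */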
1148 int
1149 dumpsys_seg(paddr_t maddr, paddr_t bytes)
1150 {
1151 	u_long i, m, n;
1152 	daddr_t blkno;
1153 	const struct bdevsw *bdev;
1154 	int (*dump)(dev_t, daddr_t, void *, size_t);
1155 	int error;
1156 
1157 	if (dumpdev == NODEV)
1158 		return ENODEV;
1159 	bdev = bdevsw_lookup(dumpdev);
1160 	if (bdev == NULL || bdev->d_psize == NULL)
1161 		return ENODEV;
1162 
1163 	dump = bdev->d_dump;
1164 
1165 	blkno = dump_header_blkno;
1166 	for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
1167 		/* Print out how many MBs we have left to go. */
1168 		if ((dump_totalbytesleft % (1024*1024)) == 0)
1169 			printf_nolog("%lu ", (unsigned long)
1170 			    (dump_totalbytesleft / (1024 * 1024)));
1171 
1172 		/* Limit size for next transfer. */
1173 		n = bytes - i;
1174 		if (n > BYTES_PER_DUMP)
1175 			n = BYTES_PER_DUMP;
1176 
1177 		for (m = 0; m < n; m += NBPG)
1178 			pmap_kenter_pa(dumpspace + m, maddr + m,
1179 			    VM_PROT_READ, 0);
1180 		pmap_update(pmap_kernel());
1181 
1182 		error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
1183 		pmap_kremove_local(dumpspace, n);
1184 		if (error)
1185 			return error;
1186 		maddr += n;
1187 		blkno += btodb(n);		/* XXX? */
1188 
1189 #if 0	/* XXX this doesn't work.  grr. */
1190 		/* operator aborting dump? */
1191 		if (sget() != NULL)
1192 			return EINTR;
1193 #endif
1194 	}
1195 	dump_header_blkno = blkno;
1196 
1197 	return 0;
1198 }
1199 
1200 void
1201 dodumpsys(void)
1202 {
1203 	const struct bdevsw *bdev;
1204 	int dumpend, psize;
1205 	int error;
1206 
1207 	if (dumpdev == NODEV)
1208 		return;
1209 
1210 	bdev = bdevsw_lookup(dumpdev);
1211 	if (bdev == NULL || bdev->d_psize == NULL)
1212 		return;
1213 	/*
1214 	 * For dumps during autoconfiguration: if the dump device has already
1215 	 * been configured but cpu_dumpconf() has not run yet, run it now...
1216 	 */
1217 	if (dumpsize == 0)
1218 		cpu_dumpconf();
1219 
1220 	printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
1221 	    (unsigned long long)major(dumpdev),
1222 	    (unsigned long long)minor(dumpdev), dumplo, dumpsize);
1223 
1224 	if (dumplo <= 0 || dumpsize <= 0) {
1225 		printf(" not possible\n");
1226 		return;
1227 	}
1228 
1229 	psize = bdev_size(dumpdev);
1230 	printf("\ndump ");
1231 	if (psize == -1) {
1232 		printf("area unavailable\n");
1233 		return;
1234 	}
1235 
1236 #if 0	/* XXX this doesn't work.  grr. */
1237 	/* toss any characters present prior to dump */
1238 	while (sget() != NULL); /*syscons and pccons differ */
1239 #endif
1240 
1241 	dump_seg_prep();
1242 	dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
1243 	if (dumpend > psize) {
1244 		printf("failed: insufficient space (%d < %d)\n",
1245 		    psize, dumpend);
1246 		goto failed;
1247 	}
1248 
1249 	dump_header_start();
1250 	if ((error = cpu_dump()) != 0)
1251 		goto err;
1252 	if ((error = dump_header_finish()) != 0)
1253 		goto err;
1254 
1255 	if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
1256 		printf("BAD header size (%ld [written] != %ld [expected])\n",
1257 		    (long)(dump_header_blkno - dumplo),
1258 		    (long)btodb(dump_header_size));
1259 		goto failed;
1260 	}
1261 
1262 	dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
1263 	error = dump_seg_iter(dumpsys_seg);
1264 
1265 	if (error == 0 && dump_header_blkno != dumpend) {
1266 		printf("BAD dump size (%ld [written] != %ld [expected])\n",
1267 		    (long)(dump_header_blkno - dumplo),
1268 		    (long)(dumpend - dumplo));
1269 		goto failed;
1270 	}
1271 
1272 err:
1273 	switch (error) {
1274 
1275 	case ENXIO:
1276 		printf("device bad\n");
1277 		break;
1278 
1279 	case EFAULT:
1280 		printf("device not ready\n");
1281 		break;
1282 
1283 	case EINVAL:
1284 		printf("area improper\n");
1285 		break;
1286 
1287 	case EIO:
1288 		printf("i/o error\n");
1289 		break;
1290 
1291 	case EINTR:
1292 		printf("aborted from console\n");
1293 		break;
1294 
1295 	case 0:
1296 		printf("succeeded\n");
1297 		break;
1298 
1299 	default:
1300 		printf("error %d\n", error);
1301 		break;
1302 	}
1303 failed:
1304 	printf("\n\n");
1305 	delay(5000000);		/* 5 seconds */
1306 }
1307 
1308 /*
1309  * This is called by main to set dumplo and dumpsize.
1310  * Dumps always skip the first PAGE_SIZE of disk space
1311  * in case there might be a disk label stored there.
1312  * If there is extra space, put dump at the end to
1313  * reduce the chance that swapping trashes it.
1314  *
1315  * Sparse dumps can't be placed as close to the end as possible, because
1316  * savecore(8) has to know where to start reading in the dump device
1317  * before it has access to any of the crashed system's state.
1318  *
1319  * Note also that a sparse dump will never be larger than a full one:
1320  * in order to add a phys_ram_seg_t to the header, at least one page
1321  * must be removed.
1322  */
1323 void
1324 cpu_dumpconf(void)
1325 {
1326 	int nblks, dumpblks;	/* size of dump area */
1327 
1328 	if (dumpdev == NODEV)
1329 		goto bad;
1330 	nblks = bdev_size(dumpdev);
1331 	if (nblks <= ctod(1))
1332 		goto bad;
1333 
1334 	dumpblks = cpu_dumpsize();
1335 	if (dumpblks < 0)
1336 		goto bad;
1337 
1338 	/* dumpsize is in page units, and doesn't include headers. */
1339 	dumpsize = cpu_dump_mempagecnt();
1340 
1341 	dumpblks += ctod(dumpsize);
1342 
1343 	/* If dump won't fit (incl. room for possible label), punt. */
1344 	if (dumpblks > (nblks - ctod(1))) {
1345 #ifndef NO_SPARSE_DUMP
1346 		/* A sparse dump might (and hopefully will) fit. */
1347 		dumplo = ctod(1);
1348 #else
1349 		/* But if we're not configured for that, punt. */
1350 		goto bad;
1351 #endif
1352 	} else {
1353 		/* Put dump at end of partition */
1354 		dumplo = nblks - dumpblks;
1355 	}
1356 
1357 
1358 	/* Now that we've decided this will work, init ancillary stuff. */
1359 	dump_misc_init();
1360 	return;
1361 
1362  bad:
1363 	dumpsize = 0;
1364 }
1365 
1366 /*
1367  * Clear registers on exec
1368  */
1369 void
1370 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
1371 {
1372 	struct pcb *pcb = lwp_getpcb(l);
1373 	struct trapframe *tf;
1374 
1375 #ifdef USER_LDT
1376 	pmap_ldt_cleanup(l);
1377 #endif
1378 
1379 	fpu_clear(l, pack->ep_osversion >= 699002600
1380 	    ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
1381 	x86_dbregs_clear(l);
1382 
1383 	kpreempt_disable();
1384 	pcb->pcb_flags = 0;
1385 	l->l_proc->p_flag &= ~PK_32;
1386 	l->l_md.md_flags = MDL_IRET;
1387 	cpu_segregs64_zero(l);
1388 	kpreempt_enable();
1389 
1390 	tf = l->l_md.md_regs;
1391 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
1392 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
1393 	tf->tf_rdi = 0;
1394 	tf->tf_rsi = 0;
1395 	tf->tf_rbp = 0;
1396 	tf->tf_rbx = l->l_proc->p_psstrp;
1397 	tf->tf_rdx = 0;
1398 	tf->tf_rcx = 0;
1399 	tf->tf_rax = 0;
1400 	tf->tf_rip = pack->ep_entry;
1401 	tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
1402 	tf->tf_rflags = PSL_USERSET;
1403 	tf->tf_rsp = stack;
1404 	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
1405 }
1406 
1407 /*
1408  * Initialize segments and descriptor tables
1409  */
1410 char *ldtstore;
1411 char *gdtstore;
1412 
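/*
 * Fill in an interrupt/trap gate descriptor.  The page holding the gate is
 * normally mapped read-only, so write access is granted temporarily around
 * the update.
 */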
1413 void
1414 setgate(struct gate_descriptor *gd, void *func,
1415     int ist, int type, int dpl, int sel)
1416 {
1417 	vaddr_t vaddr;
1418 
1419 	vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
1420 
1421 	kpreempt_disable();
1422 	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
1423 
1424 	gd->gd_looffset = (uint64_t)func & 0xffff;
1425 	gd->gd_selector = sel;
1426 	gd->gd_ist = ist;
1427 	gd->gd_type = type;
1428 	gd->gd_dpl = dpl;
1429 	gd->gd_p = 1;
1430 	gd->gd_hioffset = (uint64_t)func >> 16;
1431 	gd->gd_zero = 0;
1432 	gd->gd_xx1 = 0;
1433 	gd->gd_xx2 = 0;
1434 	gd->gd_xx3 = 0;
1435 
1436 	pmap_changeprot_local(vaddr, VM_PROT_READ);
1437 	kpreempt_enable();
1438 }
1439 
1440 void
1441 unsetgate(struct gate_descriptor *gd)
1442 {
1443 	vaddr_t vaddr;
1444 
1445 	vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
1446 
1447 	kpreempt_disable();
1448 	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
1449 
1450 	memset(gd, 0, sizeof (*gd));
1451 
1452 	pmap_changeprot_local(vaddr, VM_PROT_READ);
1453 	kpreempt_enable();
1454 }
1455 
1456 void
1457 setregion(struct region_descriptor *rd, void *base, uint16_t limit)
1458 {
1459 	rd->rd_limit = limit;
1460 	rd->rd_base = (uint64_t)base;
1461 }
1462 
1463 /*
1464  * Note that the base and limit fields are ignored in long mode.
1465  */
1466 void
1467 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1468 	int type, int dpl, int gran, int def32, int is64)
1469 {
1470 	sd->sd_lolimit = (unsigned)limit;
1471 	sd->sd_lobase = (unsigned long)base;
1472 	sd->sd_type = type;
1473 	sd->sd_dpl = dpl;
1474 	sd->sd_p = 1;
1475 	sd->sd_hilimit = (unsigned)limit >> 16;
1476 	sd->sd_avl = 0;
1477 	sd->sd_long = is64;
1478 	sd->sd_def32 = def32;
1479 	sd->sd_gran = gran;
1480 	sd->sd_hibase = (unsigned long)base >> 24;
1481 }
1482 
1483 void
1484 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1485 	int type, int dpl, int gran)
1486 {
1487 	memset(sd, 0, sizeof *sd);
1488 	sd->sd_lolimit = (unsigned)limit;
1489 	sd->sd_lobase = (uint64_t)base;
1490 	sd->sd_type = type;
1491 	sd->sd_dpl = dpl;
1492 	sd->sd_p = 1;
1493 	sd->sd_hilimit = (unsigned)limit >> 16;
1494 	sd->sd_gran = gran;
1495 	sd->sd_hibase = (uint64_t)base >> 24;
1496 }
1497 
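/*
 * Load this CPU's IDT.
 */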
1498 void
1499 cpu_init_idt(struct cpu_info *ci)
1500 {
1501 	struct region_descriptor region;
1502 	idt_descriptor_t *idt;
1503 
1504 	idt = ci->ci_idtvec.iv_idt;
1505 	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1506 	lidt(&region);
1507 }
1508 
1509 #define	IDTVEC(name)	__CONCAT(X, name)
1510 typedef void (vector)(void);
1511 extern vector IDTVEC(syscall);
1512 extern vector IDTVEC(syscall32);
1513 extern vector IDTVEC(osyscall);
1514 extern vector *x86_exceptions[];
1515 
1516 #ifndef XENPV
1517 static void
1518 init_x86_64_ksyms(void)
1519 {
1520 #if NKSYMS || defined(DDB) || defined(MODULAR)
1521 	extern int end;
1522 	extern int *esym;
1523 	struct btinfo_symtab *symtab;
1524 	vaddr_t tssym, tesym;
1525 
1526 #ifdef DDB
1527 	db_machine_init();
1528 #endif
1529 
1530 	symtab = lookup_bootinfo(BTINFO_SYMTAB);
1531 	if (symtab) {
1532 #ifdef KASLR
1533 		tssym = bootspace.head.va;
1534 		tesym = bootspace.head.va; /* (unused...) */
1535 #else
1536 		tssym = (vaddr_t)symtab->ssym + KERNBASE;
1537 		tesym = (vaddr_t)symtab->esym + KERNBASE;
1538 #endif
1539 		ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
1540 	} else {
1541 		uintptr_t endp = (uintptr_t)(void *)&end;
1542 
1543 		if (vm_guest == VM_GUEST_GENPVH)
1544 			ksyms_addsyms_elf(0, ((long *)endp) + 1, esym);
1545 		else
1546 			ksyms_addsyms_elf(*(long *)endp, ((long *)endp) + 1, esym);
1547 	}
1548 #endif
1549 }
1550 #endif /* XENPV */
1551 
1552 void __noasan
1553 init_bootspace(void)
1554 {
1555 	extern char __rodata_start;
1556 	extern char __data_start;
1557 	extern char __kernel_end;
1558 	size_t i = 0;
1559 
1560 	memset(&bootspace, 0, sizeof(bootspace));
1561 
1562 	bootspace.head.va = KERNTEXTOFF;
1563 	bootspace.head.pa = KERNTEXTOFF - KERNBASE;
1564 	bootspace.head.sz = 0;
1565 
1566 	bootspace.segs[i].type = BTSEG_TEXT;
1567 	bootspace.segs[i].va = KERNTEXTOFF;
1568 	bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
1569 	bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
1570 	i++;
1571 
1572 	bootspace.segs[i].type = BTSEG_RODATA;
1573 	bootspace.segs[i].va = (vaddr_t)&__rodata_start;
1574 	bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE;
1575 	bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
1576 	i++;
1577 
1578 	bootspace.segs[i].type = BTSEG_DATA;
1579 	bootspace.segs[i].va = (vaddr_t)&__data_start;
1580 	bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE;
1581 	bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
1582 	i++;
1583 
1584 	bootspace.boot.va = (vaddr_t)&__kernel_end;
1585 	bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE;
1586 	bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
1587 	    (size_t)&__kernel_end;
1588 
1589 	/* In locore.S, we allocated a tmp va. We will use it now. */
1590 	bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
1591 
1592 	/* Virtual address of the L4 page. */
1593 	bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
1594 
1595 	/* Kernel module map. */
1596 	bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE;
1597 	bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
1598 }
1599 
1600 static void
1601 init_pte(void)
1602 {
1603 #ifndef XENPV
1604 	extern uint32_t nox_flag;
1605 	pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir;
1606 	pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) |
1607 	    PTE_P;
1608 #endif
1609 
1610 	extern pd_entry_t *normal_pdes[3];
1611 	normal_pdes[0] = L2_BASE;
1612 	normal_pdes[1] = L3_BASE;
1613 	normal_pdes[2] = L4_BASE;
1614 }
1615 
1616 void
1617 init_slotspace(void)
1618 {
1619 	/*
1620 	 * XXX Too early to use cprng(9), or even entropy_extract.
1621 	 */
1622 	struct entpool pool;
1623 	size_t randhole;
1624 	vaddr_t randva;
1625 	uint64_t sample;
1626 	vaddr_t va;
1627 
1628 	memset(&pool, 0, sizeof pool);
1629 	cpu_rng_early_sample(&sample);
1630 	entpool_enter(&pool, &sample, sizeof sample);
1631 
1632 	memset(&slotspace, 0, sizeof(slotspace));
1633 
1634 	/* User. [256, because we want to land in >= 256] */
1635 	slotspace.area[SLAREA_USER].sslot = 0;
1636 	slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1;
1637 	slotspace.area[SLAREA_USER].active = true;
1638 
1639 #ifdef XENPV
1640 	/* PTE. */
1641 	slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE;
1642 	slotspace.area[SLAREA_PTE].nslot = 1;
1643 	slotspace.area[SLAREA_PTE].active = true;
1644 #endif
1645 
1646 #ifdef __HAVE_PCPU_AREA
1647 	/* Per-CPU. */
1648 	slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU;
1649 	slotspace.area[SLAREA_PCPU].nslot = 1;
1650 	slotspace.area[SLAREA_PCPU].active = true;
1651 #endif
1652 
1653 #ifdef __HAVE_DIRECT_MAP
1654 	/* Direct Map. [Randomized later] */
1655 	slotspace.area[SLAREA_DMAP].active = false;
1656 #endif
1657 
1658 #ifdef XENPV
1659 	/* Hypervisor. */
1660 	slotspace.area[SLAREA_HYPV].sslot = 256;
1661 	slotspace.area[SLAREA_HYPV].nslot = 17;
1662 	slotspace.area[SLAREA_HYPV].active = true;
1663 #endif
1664 
1665 #ifdef KASAN
1666 	/* ASAN. */
1667 	slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN;
1668 	slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN;
1669 	slotspace.area[SLAREA_ASAN].active = true;
1670 #endif
1671 
1672 #ifdef KMSAN
1673 	/* MSAN. */
1674 	slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN;
1675 	slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN;
1676 	slotspace.area[SLAREA_MSAN].active = true;
1677 #endif
1678 
1679 	/* Kernel. */
1680 	slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE;
1681 	slotspace.area[SLAREA_KERN].nslot = 1;
1682 	slotspace.area[SLAREA_KERN].active = true;
1683 
1684 	/* Main. */
1685 	cpu_rng_early_sample(&sample);
1686 	entpool_enter(&pool, &sample, sizeof sample);
1687 	entpool_extract(&pool, &randhole, sizeof randhole);
1688 	entpool_extract(&pool, &randva, sizeof randva);
1689 	va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4,
1690 	    NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */
1691 	vm_min_kernel_address = va;
1692 	vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4;
1693 
1694 #ifndef XENPV
1695 	/* PTE. */
1696 	cpu_rng_early_sample(&sample);
1697 	entpool_enter(&pool, &sample, sizeof sample);
1698 	entpool_extract(&pool, &randhole, sizeof randhole);
1699 	entpool_extract(&pool, &randva, sizeof randva);
1700 	va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva);
1701 	pte_base = (pd_entry_t *)va;
1702 #endif
1703 
1704 	explicit_memset(&pool, 0, sizeof pool);
1705 }
1706 
1707 void
1708 init_x86_64(paddr_t first_avail)
1709 {
1710 	extern void consinit(void);
1711 	struct region_descriptor region;
1712 	struct mem_segment_descriptor *ldt_segp;
1713 	struct idt_vec *iv;
1714 	idt_descriptor_t *idt;
1715 	int x;
1716 	struct pcb *pcb;
1717 	extern vaddr_t lwp0uarea;
1718 #ifndef XENPV
1719 	extern paddr_t local_apic_pa;
1720 #endif
1721 
1722 	KASSERT(first_avail % PAGE_SIZE == 0);
1723 
1724 #ifdef XENPV
1725 	KASSERT(HYPERVISOR_shared_info != NULL);
1726 	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1727 #endif
1728 
1729 #ifdef XEN
1730 	if (vm_guest == VM_GUEST_XENPVH || vm_guest == VM_GUEST_GENPVH)
1731 		xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1732 #endif
1733 	init_pte();
1734 
1735 	uvm_lwp_setuarea(&lwp0, lwp0uarea);
1736 
1737 	cpu_probe(&cpu_info_primary);
1738 #ifdef SVS
1739 	svs_init();
1740 #endif
1741 
1742 	/*
1743 	 * Initialize MSRs on cpu0:
1744 	 *
1745 	 * - Enables SYSCALL/SYSRET.
1746 	 *
1747 	 * - Sets up %fs and %gs so that %gs points to the current
1748 	 *   struct cpu_info as needed for CPUVAR(...), curcpu(), and
1749 	 *   curlwp.
1750 	 *
1751 	 * - Enables the no-execute bit if supported.
1752 	 *
1753 	 * Thus, after this point, CPUVAR(...), curcpu(), and curlwp
1754 	 * will work on cpu0.
1755 	 *
1756 	 * Note: The call to cpu_init_msrs for secondary CPUs happens
1757 	 * in cpu_hatch.
1758 	 */
1759 	cpu_init_msrs(&cpu_info_primary, true);
1760 
1761 #ifndef XENPV
1762 	cpu_speculation_init(&cpu_info_primary);
1763 #endif
1764 
1765 	use_pae = 1; /* PAE always enabled in long mode */
1766 
1767 	pcb = lwp_getpcb(&lwp0);
1768 #ifdef XENPV
1769 	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1770 	pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
1771 #else
1772 	pcb->pcb_cr3 = PDPpaddr;
1773 #endif
1774 
1775 #if NISA > 0 || NPCI > 0
1776 	x86_bus_space_init();
1777 #endif
1778 
1779 	pat_init(&cpu_info_primary);
1780 
1781 	consinit();	/* XXX SHOULD NOT BE DONE HERE */
1782 
1783 	/*
1784 	 * Initialize PAGE_SIZE-dependent variables.
1785 	 */
1786 	uvm_md_init();
1787 
1788 	uvmexp.ncolors = 2;
1789 
1790 	avail_start = first_avail;
1791 
1792 #ifndef XENPV
1793 	/*
1794 	 * Low memory reservations:
1795 	 * Page 0:	BIOS data
1796 	 * Page 1:	BIOS callback (not used yet, for symmetry with i386)
1797 	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
1798 	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
1799 	 * Page 4:	Temporary page table for 0MB-4MB
1800 	 * Page 5:	Temporary page directory
1801 	 * Page 6:	Temporary page map level 3
1802 	 * Page 7:	Temporary page map level 4
1803 	 */
1804 	lowmem_rsvd = 8 * PAGE_SIZE;
1805 
1806 	/* Initialize the memory clusters (needed in pmap_bootstrap). */
1807 	init_x86_clusters();
1808 #else
1809 	/* Parse Xen command line (replace bootinfo) */
1810 	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1811 
1812 	avail_end = ctob(xen_start_info.nr_pages);
1813 	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1814 	pmap_pa_end = avail_end;
1815 #endif
1816 
1817 	/*
1818 	 * Call pmap initialization to make new kernel address space.
1819 	 * We must do this before loading pages into the VM system.
1820 	 */
1821 	pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
1822 
1823 	/*
1824 	 * Initialize RNG to get entropy ASAP either from CPU
1825 	 * RDRAND/RDSEED or from seed on disk.  Constraints:
1826 	 *
1827 	 * - Must happen after cpu_init_msrs so that curcpu() and
1828 	 *   curlwp work.
1829 	 *
1830 	 * - Must happen after consinit so we have the opportunity to
1831 	 *   print useful feedback.
1832 	 *
1833 	 * - On KASLR kernels, must happen after pmap_bootstrap because
1834 	 *   x86_rndseed requires access to the direct map.
1835 	 */
1836 	cpu_rng_init();
1837 	x86_rndseed();
1838 
1839 #ifndef XENPV
1840 	/* Internalize the physical pages into the VM system. */
1841 	init_x86_vm(avail_start);
1842 #else
1843 	physmem = xen_start_info.nr_pages;
1844 	uvm_page_physload(atop(avail_start), atop(avail_end),
1845 	    atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT);
1846 #endif
1847 
1848 	init_x86_msgbuf();
1849 
1850 	kasan_init();
1851 	kcsan_init();
1852 	kmsan_init((void *)lwp0uarea);
1853 
1854 	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1855 
1856 	kpreempt_disable();
1857 
1858 #ifndef XENPV
1859 	pmap_kenter_pa(local_apic_va, local_apic_pa,
1860 	    VM_PROT_READ|VM_PROT_WRITE, 0);
1861 	pmap_update(pmap_kernel());
1862 	memset((void *)local_apic_va, 0, PAGE_SIZE);
1863 #endif
1864 
1865 	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1866 	pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1867 	pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1868 	pmap_update(pmap_kernel());
1869 	memset((void *)idt_vaddr, 0, PAGE_SIZE);
1870 	memset((void *)gdt_vaddr, 0, PAGE_SIZE);
1871 	memset((void *)ldt_vaddr, 0, PAGE_SIZE);
1872 
1873 #ifndef XENPV
1874 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1875 #endif
1876 
1877 	pmap_update(pmap_kernel());
1878 
1879 	iv = &(cpu_info_primary.ci_idtvec);
1880 	idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
1881 	idt = iv->iv_idt;
1882 	gdtstore = (char *)gdt_vaddr;
1883 	ldtstore = (char *)ldt_vaddr;
1884 
1885 	/*
1886 	 * Make GDT gates and memory segments.
1887 	 */
1888 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
1889 	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1890 
1891 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
1892 	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1893 
1894 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
1895 	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1896 
1897 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
1898 	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1899 
1900 #ifndef XENPV
1901 	set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
1902 	    LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
1903 #endif
1904 
1905 	/*
1906 	 * Make LDT memory segments.
1907 	 */
1908 	*(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
1909 	    *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
1910 	*(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
1911 	    *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
1912 
1913 	/*
1914 	 * 32-bit GDT entries.
1915 	 */
1916 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
1917 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
1918 
1919 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
1920 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1921 
1922 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
1923 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1924 
1925 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
1926 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1927 
1928 	/*
1929 	 * 32-bit LDT entries.
1930 	 */
1931 	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
1932 	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1933 	    SDT_MEMERA, SEL_UPL, 1, 1, 0);
1934 	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
1935 	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1936 	    SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1937 
1938 	/* CPU-specific IDT exceptions. */
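	/*
	 * ist selects an Interrupt Stack Table slot in the TSS, so #DB,
	 * NMI and double faults run on dedicated stacks (0 means the
	 * regular kernel stack).  sel is the gate's privilege level:
	 * SEL_UPL for the vectors userland may raise directly with
	 * int3/into.
	 */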
1939 	for (x = 0; x < NCPUIDT; x++) {
1940 		int sel, ist;
1941 
1942 		/* Reset to defaults; special cases handled below. */
1943 		sel = SEL_KPL;
1944 		ist = 0;
1945 
1946 		idt_vec_reserve(iv, x);
1947 
1948 		switch (x) {
1949 		case 1:	/* DB */
1950 			ist = 4;
1951 			break;
1952 		case 2:	/* NMI */
1953 			ist = 3;
1954 			break;
1955 		case 3:	/* BP (breakpoint) */
1956 		case 4:	/* OF (overflow) */
1957 			sel = SEL_UPL;
1958 			break;
1959 		case 8:	/* double fault */
1960 			ist = 2;
1961 			break;
1962 #ifdef XENPV
1963 		case 18: /* MCA */
1964 			sel |= 0x4; /* Auto EOI/mask */
1965 			break;
1966 #endif /* XENPV */
1967 		default:
1968 			break;
1969 		}
1970 
1971 		set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT,
1972 		    sel, GSEL(GCODE_SEL, SEL_KPL));
1973 	}
1974 
1975 	/* interrupt gate for int $0x80 syscalls */
1976 	idt_vec_reserve(iv, 128);
1977 	set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
1978 	    GSEL(GCODE_SEL, SEL_KPL));
1979 
1980 	kpreempt_enable();
1981 
1982 	setregion(&region, gdtstore, DYNSEL_START - 1);
1983 	lgdt(&region);
1984 
1985 #ifdef XENPV
1986 	/* Init Xen callbacks and syscall handlers */
1987 	if (HYPERVISOR_set_callbacks(
1988 	    (unsigned long) hypervisor_callback,
1989 	    (unsigned long) failsafe_callback,
1990 	    (unsigned long) Xsyscall))
1991 		panic("HYPERVISOR_set_callbacks() failed");
1992 #endif /* XENPV */
1993 
1994 	cpu_init_idt(&cpu_info_primary);
1995 
1996 #ifdef XENPV
1997 	xen_init_ksyms();
1998 #else /* XENPV */
1999 #ifdef XEN
2000 	if (vm_guest == VM_GUEST_XENPVH)
2001 		xen_init_ksyms();
2002 	else
2003 #endif /* XEN */
2004 		init_x86_64_ksyms();
2005 #endif /* XENPV */
2006 
2007 #ifndef XENPV
2008 	intr_default_setup();
2009 #else
2010 	events_default_setup();
2011 #endif
2012 
2013 	splraise(IPL_HIGH);
2014 	x86_enable_intr();
2015 
2016 #ifdef DDB
2017 	if (boothowto & RB_KDB)
2018 		Debugger();
2019 #endif
2020 #ifdef KGDB
2021 	kgdb_port_init();
2022 	if (boothowto & RB_KDB) {
2023 		kgdb_debug_init = 1;
2024 		kgdb_connect(1);
2025 	}
2026 #endif
2027 
2028 	pcb->pcb_dbregs = NULL;
2029 	x86_dbregs_init();
2030 }
2031 
2032 void
2033 cpu_reset(void)
2034 {
2035 #ifndef XENPV
2036 	idt_descriptor_t *idt;
2037 	vaddr_t vaddr;
2038 
2039 	idt = cpu_info_primary.ci_idtvec.iv_idt;
2040 	vaddr = (vaddr_t)idt;
2041 #endif
2042 
2043 	x86_disable_intr();
2044 
2045 #ifdef XENPV
2046 	HYPERVISOR_reboot();
2047 #else
2048 
2049 	x86_reset();
2050 
2051 	/*
2052 	 * Try to cause a triple fault and watchdog reset by making the IDT
2053 	 * invalid and causing a fault.
2054 	 */
2055 	kpreempt_disable();
2056 	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
2057 	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
2058 	kpreempt_enable();
2059 	breakpoint();
2060 
2061 #if 0
2062 	/*
2063 	 * Try to cause a triple fault and watchdog reset by unmapping the
2064 	 * entire address space and doing a TLB flush.
2065 	 */
2066 	memset((void *)PTD, 0, PAGE_SIZE);
2067 	tlbflush();
2068 #endif
2069 #endif	/* XENPV */
2070 
2071 	for (;;);
2072 }
2073 
2074 void
2075 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
2076 {
2077 	const struct trapframe *tf = l->l_md.md_regs;
2078 	__greg_t ras_rip;
2079 
2080 	mcp->__gregs[_REG_RDI] = tf->tf_rdi;
2081 	mcp->__gregs[_REG_RSI] = tf->tf_rsi;
2082 	mcp->__gregs[_REG_RDX] = tf->tf_rdx;
2083 	mcp->__gregs[_REG_R10] = tf->tf_r10;
2084 	mcp->__gregs[_REG_R8]  = tf->tf_r8;
2085 	mcp->__gregs[_REG_R9]  = tf->tf_r9;
2086 	/* argX not touched */
2087 	mcp->__gregs[_REG_RCX] = tf->tf_rcx;
2088 	mcp->__gregs[_REG_R11] = tf->tf_r11;
2089 	mcp->__gregs[_REG_R12] = tf->tf_r12;
2090 	mcp->__gregs[_REG_R13] = tf->tf_r13;
2091 	mcp->__gregs[_REG_R14] = tf->tf_r14;
2092 	mcp->__gregs[_REG_R15] = tf->tf_r15;
2093 	mcp->__gregs[_REG_RBP] = tf->tf_rbp;
2094 	mcp->__gregs[_REG_RBX] = tf->tf_rbx;
2095 	mcp->__gregs[_REG_RAX] = tf->tf_rax;
2096 	mcp->__gregs[_REG_GS]  = 0;
2097 	mcp->__gregs[_REG_FS]  = 0;
2098 	mcp->__gregs[_REG_ES]  = GSEL(GUDATA_SEL, SEL_UPL);
2099 	mcp->__gregs[_REG_DS]  = GSEL(GUDATA_SEL, SEL_UPL);
2100 	mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno;
2101 	mcp->__gregs[_REG_ERR] = tf->tf_err;
2102 	mcp->__gregs[_REG_RIP] = tf->tf_rip;
2103 	mcp->__gregs[_REG_CS]  = LSEL(LUCODE_SEL, SEL_UPL);
2104 	mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags;
2105 	mcp->__gregs[_REG_RSP] = tf->tf_rsp;
2106 	mcp->__gregs[_REG_SS]  = LSEL(LUDATA_SEL, SEL_UPL);
2107 
2108 	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
2109 	    (void *) mcp->__gregs[_REG_RIP])) != -1)
2110 		mcp->__gregs[_REG_RIP] = ras_rip;
2111 
2112 	*flags |= _UC_CPU;
2113 
2114 	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
2115 	*flags |= _UC_TLSBASE;
2116 
2117 	process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
2118 	*flags |= _UC_FPU;
2119 }
2120 
2121 int
2122 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
2123 {
2124 	struct trapframe *tf = l->l_md.md_regs;
2125 	const __greg_t *gr = mcp->__gregs;
2126 	struct proc *p = l->l_proc;
2127 	int error;
2128 	int64_t rflags;
2129 
2130 	CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);
2131 
2132 	if ((flags & _UC_CPU) != 0) {
2133 		error = cpu_mcontext_validate(l, mcp);
2134 		if (error != 0)
2135 			return error;
2136 
2137 		tf->tf_rdi  = gr[_REG_RDI];
2138 		tf->tf_rsi  = gr[_REG_RSI];
2139 		tf->tf_rdx  = gr[_REG_RDX];
2140 		tf->tf_r10  = gr[_REG_R10];
2141 		tf->tf_r8   = gr[_REG_R8];
2142 		tf->tf_r9   = gr[_REG_R9];
2143 		/* argX not touched */
2144 		tf->tf_rcx  = gr[_REG_RCX];
2145 		tf->tf_r11  = gr[_REG_R11];
2146 		tf->tf_r12  = gr[_REG_R12];
2147 		tf->tf_r13  = gr[_REG_R13];
2148 		tf->tf_r14  = gr[_REG_R14];
2149 		tf->tf_r15  = gr[_REG_R15];
2150 		tf->tf_rbp  = gr[_REG_RBP];
2151 		tf->tf_rbx  = gr[_REG_RBX];
2152 		tf->tf_rax  = gr[_REG_RAX];
2153 		tf->tf_gs   = 0;
2154 		tf->tf_fs   = 0;
2155 		tf->tf_es   = GSEL(GUDATA_SEL, SEL_UPL);
2156 		tf->tf_ds   = GSEL(GUDATA_SEL, SEL_UPL);
2157 		/* trapno, err not touched */
2158 		tf->tf_rip  = gr[_REG_RIP];
2159 		tf->tf_cs   = LSEL(LUCODE_SEL, SEL_UPL);
2160 		rflags = tf->tf_rflags;
2161 		rflags &= ~PSL_USER;
2162 		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
2163 		tf->tf_rsp  = gr[_REG_RSP];
2164 		tf->tf_ss   = LSEL(LUDATA_SEL, SEL_UPL);
2165 
2166 		l->l_md.md_flags |= MDL_IRET;
2167 	}
2168 
2169 	if ((flags & _UC_FPU) != 0)
2170 		process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);
2171 
2172 	if ((flags & _UC_TLSBASE) != 0)
2173 		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
2174 
2175 	mutex_enter(p->p_lock);
2176 	if (flags & _UC_SETSTACK)
2177 		l->l_sigstk.ss_flags |= SS_ONSTACK;
2178 	if (flags & _UC_CLRSTACK)
2179 		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
2180 	mutex_exit(p->p_lock);
2181 
2182 	return 0;
2183 }
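
/*
 * Illustrative only: cpu_getmcontext(), cpu_setmcontext() and
 * cpu_mcontext_validate() back the getcontext(2)/setcontext(2) and
 * signal paths.  A minimal userland sketch of the consumer side
 * (register indices from <machine/mcontext.h>) looks roughly like:
 *
 *	#include <ucontext.h>
 *
 *	int
 *	main(void)
 *	{
 *		volatile int resumed = 0;
 *		ucontext_t uc;
 *
 *		getcontext(&uc);
 *		if (!resumed) {
 *			resumed = 1;
 *			uc.uc_mcontext.__gregs[_REG_RDI] = 0;
 *			setcontext(&uc);
 *		}
 *		return 0;
 *	}
 *
 * getcontext() is filled in via cpu_getmcontext(); setcontext() is
 * vetted by cpu_mcontext_validate() before cpu_setmcontext() installs
 * the new register set.
 */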
2184 
2185 int
2186 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
2187 {
2188 	struct proc *p __diagused = l->l_proc;
2189 	struct trapframe *tf = l->l_md.md_regs;
2190 	const __greg_t *gr;
2191 	uint16_t sel;
2192 
2193 	KASSERT((p->p_flag & PK_32) == 0);
2194 	gr = mcp->__gregs;
2195 
2196 	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2197 		return EINVAL;
2198 
2199 	sel = gr[_REG_ES] & 0xffff;
2200 	if (sel != 0 && !VALID_USER_DSEL(sel))
2201 		return EINVAL;
2202 
2203 	sel = gr[_REG_FS] & 0xffff;
2204 	if (sel != 0 && !VALID_USER_DSEL(sel))
2205 		return EINVAL;
2206 
2207 	sel = gr[_REG_GS] & 0xffff;
2208 	if (sel != 0 && !VALID_USER_DSEL(sel))
2209 		return EINVAL;
2210 
2211 	sel = gr[_REG_DS] & 0xffff;
2212 	if (!VALID_USER_DSEL(sel))
2213 		return EINVAL;
2214 
2215 #ifndef XENPV
2216 	sel = gr[_REG_SS] & 0xffff;
2217 	if (!VALID_USER_DSEL(sel))
2218 		return EINVAL;
2219 
2220 	sel = gr[_REG_CS] & 0xffff;
2221 	if (!VALID_USER_CSEL(sel))
2222 		return EINVAL;
2223 #endif
2224 
2225 	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
2226 		return EINVAL;
2227 
2228 	return 0;
2229 }
2230 
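/*
 * Machine-dependent access check used by the mm(4) (/dev/mem, /dev/kmem)
 * driver: decide whether the kernel virtual address may be accessed with
 * the given protection, based on which bootspace segment it falls in.
 * Leaving *handled false defers the decision to the MI code.
 */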
2231 int
2232 mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
2233 {
2234 	const vaddr_t v = (vaddr_t)ptr;
2235 	vaddr_t kva, kva_end;
2236 	size_t i;
2237 
2238 	kva = bootspace.head.va;
2239 	kva_end = kva + bootspace.head.sz;
2240 	if (v >= kva && v < kva_end) {
2241 		*handled = true;
2242 		return 0;
2243 	}
2244 
2245 	for (i = 0; i < BTSPACE_NSEGS; i++) {
2246 		kva = bootspace.segs[i].va;
2247 		kva_end = kva + bootspace.segs[i].sz;
2248 		if (v < kva || v >= kva_end)
2249 			continue;
2250 		*handled = true;
2251 		if (bootspace.segs[i].type == BTSEG_TEXT ||
2252 		    bootspace.segs[i].type == BTSEG_RODATA) {
2253 			if (prot & VM_PROT_WRITE) {
2254 				return EFAULT;
2255 			}
2256 		}
2257 		return 0;
2258 	}
2259 
2260 	kva = bootspace.boot.va;
2261 	kva_end = kva + bootspace.boot.sz;
2262 	if (v >= kva && v < kva_end) {
2263 		*handled = true;
2264 		return 0;
2265 	}
2266 
2267 	if (v >= bootspace.smodule && v < bootspace.emodule) {
2268 		*handled = true;
2269 		if (!uvm_map_checkprot(module_map, v, v + 1, prot)) {
2270 			return EFAULT;
2271 		}
2272 	} else {
2273 		*handled = false;
2274 	}
2275 	return 0;
2276 }
2277 
2278 /*
2279  * Zero out a 64-bit LWP's segment registers. Used when exec'ing a new
2280  * 64-bit program.
2281  */
2282 void
2283 cpu_segregs64_zero(struct lwp *l)
2284 {
2285 	struct trapframe * const tf = l->l_md.md_regs;
2286 	struct pcb *pcb;
2287 	uint64_t zero = 0;
2288 
2289 	KASSERT(kpreempt_disabled());
2290 	KASSERT((l->l_proc->p_flag & PK_32) == 0);
2291 	KASSERT(l == curlwp);
2292 
2293 	pcb = lwp_getpcb(l);
2294 
2295 	tf->tf_fs = 0;
2296 	tf->tf_gs = 0;
2297 	setds(GSEL(GUDATA_SEL, SEL_UPL));
2298 	setes(GSEL(GUDATA_SEL, SEL_UPL));
2299 	setfs(0);
2300 	setusergs(0);
2301 
2302 #ifndef XENPV
2303 	wrmsr(MSR_FSBASE, 0);
2304 	wrmsr(MSR_KERNELGSBASE, 0);
2305 #else
2306 	HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
2307 	HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
2308 #endif
2309 
2310 	pcb->pcb_fs = 0;
2311 	pcb->pcb_gs = 0;
2312 	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2313 	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2314 }
2315 
2316 /*
2317  * Zero out a 32-bit LWP's segment registers. Used when exec'ing a new
2318  * 32-bit program.
2319  */
2320 void
2321 cpu_segregs32_zero(struct lwp *l)
2322 {
2323 	struct trapframe * const tf = l->l_md.md_regs;
2324 	struct pcb *pcb;
2325 	uint64_t zero = 0;
2326 
2327 	KASSERT(kpreempt_disabled());
2328 	KASSERT(l->l_proc->p_flag & PK_32);
2329 	KASSERT(l == curlwp);
2330 
2331 	pcb = lwp_getpcb(l);
2332 
2333 	tf->tf_fs = 0;
2334 	tf->tf_gs = 0;
2335 	setds(GSEL(GUDATA32_SEL, SEL_UPL));
2336 	setes(GSEL(GUDATA32_SEL, SEL_UPL));
2337 	setfs(0);
2338 	setusergs(0);
2339 	pcb->pcb_fs = 0;
2340 	pcb->pcb_gs = 0;
2341 	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2342 	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2343 }
2344 
2345 /*
2346  * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
2347  * Used only for 32-bit processes.
2348  */
2349 void
2350 cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
2351 {
2352 	struct trapframe *tf;
2353 	struct pcb *pcb;
2354 
2355 	KASSERT(l->l_proc->p_flag & PK_32);
2356 	KASSERT(l == curlwp);
2357 
2358 	tf = l->l_md.md_regs;
2359 	fssel &= 0xFFFF;
2360 	gssel &= 0xFFFF;
2361 
2362 	pcb = lwp_getpcb(l);
2363 	kpreempt_disable();
2364 	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
2365 	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
2366 
2367 #ifdef XENPV
2368 	setusergs(gssel);
2369 #endif
2370 
2371 	tf->tf_fs = fssel;
2372 	tf->tf_gs = gssel;
2373 	kpreempt_enable();
2374 }
2375 
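/*
 * If the kernel VA lies within the physical-memory direct map, hand back
 * the corresponding physical address so the caller (the mm(4) driver) can
 * do I/O on it without setting up a temporary mapping.
 */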
2376 bool
2377 mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
2378 {
2379 	vaddr_t va = (vaddr_t)addr;
2380 
2381 #ifdef __HAVE_DIRECT_MAP
2382 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2383 		*paddr = PMAP_DIRECT_UNMAP(va);
2384 		return true;
2385 	}
2386 #else
2387 	__USE(va);
2388 #endif
2389 
2390 	return false;
2391 }
2392 
2393 bool
2394 mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
2395 {
2396 #ifdef __HAVE_DIRECT_MAP
2397 	*vaddr = PMAP_DIRECT_MAP(paddr);
2398 	return true;
2399 #else
2400 	return false;
2401 #endif
2402 }
2403 
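/*
 * Copy the source CPU's IDT entries and vector-allocation bitmap into the
 * destination IDT.  The destination page is normally mapped read-only, so
 * it is made writable for the duration of the copy.
 */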
2404 static void
2405 idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
2406 {
2407 	idt_descriptor_t *idt_dst;
2408 
2409 	idt_dst = dst->iv_idt;
2410 
2411 	kpreempt_disable();
2412 	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE);
2413 
2414 	memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
2415 	memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
2416 
2417 	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ);
2418 	kpreempt_enable();
2419 }
2420 
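/*
 * Set up the IDT for a CPU.  With per-CPU IDTs enabled, each secondary CPU
 * gets its own read-only IDT page initialized from the primary CPU's;
 * otherwise all CPUs share the IDT at idt_vaddr.
 */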
2421 void
2422 idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
2423 {
2424 	vaddr_t va;
2425 
2426 	if (cid != cpu_index(&cpu_info_primary) &&
2427 	    idt_vec_is_pcpu()) {
2428 #ifdef __HAVE_PCPU_AREA
2429 		va = (vaddr_t)&pcpuarea->ent[cid].idt;
2430 #else
2431 		struct vm_page *pg;
2432 
2433 		va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
2434 		    UVM_KMF_VAONLY);
2435 		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
2436 		if (pg == NULL) {
2437 			panic("failed to allocate a page for IDT");
2438 		}
2439 		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
2440 		    VM_PROT_READ|VM_PROT_WRITE, 0);
2441 		pmap_update(pmap_kernel());
2442 #endif
2443 
2444 		memset((void *)va, 0, PAGE_SIZE);
2445 #ifndef XENPV
2446 		pmap_changeprot_local(va, VM_PROT_READ);
2447 #endif
2448 		pmap_update(pmap_kernel());
2449 
2450 		iv->iv_idt = (void *)va;
2451 		idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
2452 	} else {
2453 		iv->iv_idt = (void *)idt_vaddr;
2454 	}
2455 }
2456