1 /* $NetBSD: machdep.c,v 1.371 2025/01/22 10:03:55 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace 10 * Simulation Facility, NASA Ames Research Center. 11 * 12 * This code is derived from software contributed to The NetBSD Foundation 13 * by Coyote Point Systems, Inc. which was written under contract to Coyote 14 * Point by Jed Davis and Devon O'Dell. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 36 */ 37 38 /* 39 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 40 * 41 * Permission to use, copy, modify, and distribute this software for any 42 * purpose with or without fee is hereby granted, provided that the above 43 * copyright notice and this permission notice appear in all copies. 44 * 45 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 46 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 47 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 48 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 49 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 50 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 51 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 52 */ 53 54 /* 55 * Copyright (c) 2007 Manuel Bouyer. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 
65 * 66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 67 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 68 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 69 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 70 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 71 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 72 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 73 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 74 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 75 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* 79 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 80 * All rights reserved. 81 * 82 * This code is derived from software contributed to Berkeley by 83 * William Jolitz. 84 * 85 * Redistribution and use in source and binary forms, with or without 86 * modification, are permitted provided that the following conditions 87 * are met: 88 * 1. Redistributions of source code must retain the above copyright 89 * notice, this list of conditions and the following disclaimer. 90 * 2. Redistributions in binary form must reproduce the above copyright 91 * notice, this list of conditions and the following disclaimer in the 92 * documentation and/or other materials provided with the distribution. 93 * 3. Neither the name of the University nor the names of its contributors 94 * may be used to endorse or promote products derived from this software 95 * without specific prior written permission. 96 * 97 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 98 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 99 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 100 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 101 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 102 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 103 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 104 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 105 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 106 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 107 * SUCH DAMAGE. 
108 * 109 * @(#)machdep.c 7.4 (Berkeley) 6/3/91 110 */ 111 112 #include <sys/cdefs.h> 113 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.371 2025/01/22 10:03:55 riastradh Exp $"); 114 115 #include "opt_modular.h" 116 #include "opt_user_ldt.h" 117 #include "opt_ddb.h" 118 #include "opt_kgdb.h" 119 #include "opt_cpureset_delay.h" 120 #include "opt_mtrr.h" 121 #include "opt_realmem.h" 122 #include "opt_xen.h" 123 #include "opt_svs.h" 124 #include "opt_kaslr.h" 125 #ifndef XENPV 126 #include "opt_physmem.h" 127 #endif 128 #include "isa.h" 129 #include "pci.h" 130 131 #include <sys/param.h> 132 #include <sys/systm.h> 133 #include <sys/signal.h> 134 #include <sys/signalvar.h> 135 #include <sys/kernel.h> 136 #include <sys/cpu.h> 137 #include <sys/exec.h> 138 #include <sys/exec_aout.h> /* for MID_* */ 139 #include <sys/reboot.h> 140 #include <sys/conf.h> 141 #include <sys/msgbuf.h> 142 #include <sys/mount.h> 143 #include <sys/core.h> 144 #include <sys/kcore.h> 145 #include <sys/ucontext.h> 146 #include <machine/kcore.h> 147 #include <sys/ras.h> 148 #include <sys/syscallargs.h> 149 #include <sys/ksyms.h> 150 #include <sys/device.h> 151 #include <sys/lwp.h> 152 #include <sys/proc.h> 153 #include <sys/asan.h> 154 #include <sys/csan.h> 155 #include <sys/msan.h> 156 #include <sys/module.h> 157 #include <sys/timevar.h> 158 159 #ifdef KGDB 160 #include <sys/kgdb.h> 161 #endif 162 163 #include <lib/libkern/entpool.h> /* XXX */ 164 165 #include <dev/cons.h> 166 #include <dev/mm.h> 167 168 #include <uvm/uvm.h> 169 #include <uvm/uvm_page.h> 170 171 #include <sys/sysctl.h> 172 173 #include <machine/cpu.h> 174 #include <machine/cpu_rng.h> 175 #include <machine/cpufunc.h> 176 #include <machine/gdt.h> 177 #include <machine/intr.h> 178 #include <machine/pio.h> 179 #include <machine/psl.h> 180 #include <machine/reg.h> 181 #include <machine/specialreg.h> 182 #include <machine/bootinfo.h> 183 #include <x86/fpu.h> 184 #include <x86/dbregs.h> 185 #include <machine/mtrr.h> 186 #include <machine/mpbiosvar.h> 187 #include <machine/pmap_private.h> 188 189 #include <x86/bootspace.h> 190 #include <x86/cputypes.h> 191 #include <x86/cpuvar.h> 192 #include <x86/machdep.h> 193 #include <x86/x86/tsc.h> 194 195 #include <dev/isa/isareg.h> 196 #include <machine/isa_machdep.h> 197 #include <dev/ic/i8042reg.h> 198 199 #ifdef XEN 200 #include <xen/xen.h> 201 #include <xen/hypervisor.h> 202 #include <xen/evtchn.h> 203 #include <xen/include/public/version.h> 204 #include <xen/include/public/vcpu.h> 205 #endif /* XEN */ 206 207 #include <ddb/db_active.h> 208 209 #ifdef DDB 210 #include <machine/db_machdep.h> 211 #include <ddb/db_extern.h> 212 #include <ddb/db_output.h> 213 #include <ddb/db_interface.h> 214 #endif 215 216 #include "acpica.h" 217 218 #if NACPICA > 0 219 #include <dev/acpi/acpivar.h> 220 #define ACPI_MACHDEP_PRIVATE 221 #include <machine/acpi_machdep.h> 222 #else 223 #include <machine/i82489var.h> 224 #endif 225 226 #include "isa.h" 227 #include "isadma.h" 228 #include "ksyms.h" 229 230 /* the following is used externally (sysctl_hw) */ 231 char machine[] = "amd64"; /* CPU "architecture" */ 232 char machine_arch[] = "x86_64"; /* machine == machine_arch */ 233 234 #ifdef CPURESET_DELAY 235 int cpureset_delay = CPURESET_DELAY; 236 #else 237 int cpureset_delay = 2000; /* default to 2s */ 238 #endif 239 240 int cpu_class = CPUCLASS_686; 241 242 #ifdef MTRR 243 const struct mtrr_funcs *mtrr_funcs; 244 #endif 245 246 int cpu_class; 247 int use_pae; 248 249 #ifndef NO_SPARSE_DUMP 250 int sparse_dump = 1; 251 252 paddr_t max_paddr = 0; 
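/*
 * sparse_dump_physmap (below) is a bitmap with one bit per physical page
 * frame up to max_paddr; sparse_dump_reset() clears it and
 * sparse_dump_mark() sets/clears bits to select which pages a sparse
 * dump will actually write out.
 */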
253 unsigned char *sparse_dump_physmap; 254 #endif 255 256 char *dump_headerbuf, *dump_headerbuf_ptr; 257 #define dump_headerbuf_size PAGE_SIZE 258 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size) 259 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr) 260 daddr_t dump_header_blkno; 261 262 size_t dump_nmemsegs; 263 size_t dump_npages; 264 size_t dump_header_size; 265 size_t dump_totalbytesleft; 266 267 vaddr_t idt_vaddr; 268 paddr_t idt_paddr; 269 vaddr_t gdt_vaddr; 270 paddr_t gdt_paddr; 271 vaddr_t ldt_vaddr; 272 paddr_t ldt_paddr; 273 274 static struct vm_map module_map_store; 275 extern struct bootspace bootspace; 276 extern struct slotspace slotspace; 277 278 vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT; 279 vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT; 280 pd_entry_t *pte_base __read_mostly; 281 282 struct vm_map *phys_map = NULL; 283 284 extern paddr_t lowmem_rsvd; 285 extern paddr_t avail_start, avail_end; 286 #ifdef XENPV 287 extern paddr_t pmap_pa_start, pmap_pa_end; 288 #endif 289 290 struct nmistore { 291 uint64_t cr3; 292 uint64_t scratch; 293 } __packed; 294 295 /* 296 * Size of memory segments, before any memory is stolen. 297 */ 298 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; 299 int mem_cluster_cnt; 300 301 int cpu_dump(void); 302 int cpu_dumpsize(void); 303 u_long cpu_dump_mempagecnt(void); 304 void dodumpsys(void); 305 void dumpsys(void); 306 307 static void x86_64_proc0_pcb_ldt_init(void); 308 309 void dump_misc_init(void); 310 void dump_seg_prep(void); 311 int dump_seg_iter(int (*)(paddr_t, paddr_t)); 312 313 #ifndef NO_SPARSE_DUMP 314 void sparse_dump_reset(void); 315 void sparse_dump_mark(void); 316 void cpu_dump_prep_sparse(void); 317 #endif 318 319 void dump_header_start(void); 320 int dump_header_flush(void); 321 int dump_header_addbytes(const void*, size_t); 322 int dump_header_addseg(paddr_t, paddr_t); 323 int dump_header_finish(void); 324 325 int dump_seg_count_range(paddr_t, paddr_t); 326 int dumpsys_seg(paddr_t, paddr_t); 327 328 void init_bootspace(void); 329 void init_slotspace(void); 330 void init_x86_64(paddr_t); 331 332 /* 333 * Machine-dependent startup code 334 */ 335 void 336 cpu_startup(void) 337 { 338 int x, y; 339 vaddr_t minaddr, maxaddr; 340 psize_t sz; 341 342 /* 343 * For console drivers that require uvm and pmap to be initialized, 344 * we'll give them one more chance here... 345 */ 346 consinit(); 347 348 /* 349 * Initialize error message buffer (at end of core). 350 */ 351 if (msgbuf_p_cnt == 0) 352 panic("msgbuf paddr map has not been set up"); 353 for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz) 354 continue; 355 356 msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY); 357 if (msgbuf_vaddr == 0) 358 panic("failed to valloc msgbuf_vaddr"); 359 360 for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) { 361 for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE) 362 pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz, 363 msgbuf_p_seg[y].paddr + x * PAGE_SIZE, 364 VM_PROT_READ|VM_PROT_WRITE, 0); 365 } 366 367 pmap_update(pmap_kernel()); 368 369 initmsgbuf((void *)msgbuf_vaddr, round_page(sz)); 370 371 minaddr = 0; 372 373 /* 374 * Allocate a submap for physio. 375 */ 376 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 377 VM_PHYS_SIZE, 0, false, NULL); 378 379 /* 380 * Create the module map. 381 * 382 * The kernel uses RIP-relative addressing with a maximum offset of 383 * 2GB. 
Because of that, we can't put the kernel modules in kernel_map 384 * (like i386 does), since kernel_map is too far away in memory from 385 * the kernel sections. So we have to create a special module_map. 386 * 387 * The module map is taken as what is left of the bootstrap memory 388 * created in locore/prekern. 389 */ 390 uvm_map_setup(&module_map_store, bootspace.smodule, 391 bootspace.emodule, 0); 392 module_map_store.pmap = pmap_kernel(); 393 module_map = &module_map_store; 394 395 /* Say hello. */ 396 banner(); 397 398 #if NISA > 0 || NPCI > 0 399 /* Safe for i/o port / memory space allocation to use malloc now. */ 400 x86_bus_space_mallocok(); 401 #endif 402 403 #ifdef __HAVE_PCPU_AREA 404 cpu_pcpuarea_init(&cpu_info_primary); 405 #endif 406 gdt_init(); 407 x86_64_proc0_pcb_ldt_init(); 408 409 cpu_init_tss(&cpu_info_primary); 410 #if !defined(XENPV) 411 ltr(cpu_info_primary.ci_tss_sel); 412 #endif 413 414 x86_startup(); 415 } 416 417 #ifdef XENPV 418 /* used in assembly */ 419 void hypervisor_callback(void); 420 void failsafe_callback(void); 421 void x86_64_switch_context(struct pcb *); 422 void x86_64_tls_switch(struct lwp *); 423 424 void 425 x86_64_switch_context(struct pcb *new) 426 { 427 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0); 428 struct physdev_set_iopl set_iopl; 429 set_iopl.iopl = new->pcb_iopl; 430 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 431 } 432 433 void 434 x86_64_tls_switch(struct lwp *l) 435 { 436 struct cpu_info *ci = curcpu(); 437 struct pcb *pcb = lwp_getpcb(l); 438 struct trapframe *tf = l->l_md.md_regs; 439 uint64_t zero = 0; 440 441 /* 442 * Raise the IPL to IPL_HIGH. XXX Still needed? 443 */ 444 (void)splhigh(); 445 446 /* Update segment registers */ 447 if (pcb->pcb_flags & PCB_COMPAT32) { 448 update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs); 449 update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs); 450 setds(GSEL(GUDATA32_SEL, SEL_UPL)); 451 setes(GSEL(GUDATA32_SEL, SEL_UPL)); 452 setfs(GSEL(GUDATA32_SEL, SEL_UPL)); 453 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs); 454 } else { 455 update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero); 456 update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero); 457 setds(GSEL(GUDATA_SEL, SEL_UPL)); 458 setes(GSEL(GUDATA_SEL, SEL_UPL)); 459 setfs(0); 460 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); 461 HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs); 462 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs); 463 } 464 } 465 #endif /* XENPV */ 466 467 /* 468 * Set up proc0's PCB and LDT. 469 */ 470 static void 471 x86_64_proc0_pcb_ldt_init(void) 472 { 473 struct lwp *l = &lwp0; 474 struct pcb *pcb = lwp_getpcb(l); 475 476 pcb->pcb_flags = 0; 477 pcb->pcb_fs = 0; 478 pcb->pcb_gs = 0; 479 pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf; 480 pcb->pcb_iopl = IOPL_KPL; 481 pcb->pcb_dbregs = NULL; 482 pcb->pcb_cr0 = rcr0() & ~CR0_TS; 483 l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1; 484 485 #if !defined(XENPV) 486 lldt(GSYSSEL(GLDT_SEL, SEL_KPL)); 487 #else 488 xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3); 489 /* Reset TS bit and set kernel stack for interrupt handlers */ 490 HYPERVISOR_fpu_taskswitch(1); 491 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0); 492 struct physdev_set_iopl set_iopl; 493 set_iopl.iopl = pcb->pcb_iopl; 494 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 495 #endif 496 } 497 498 /* 499 * Set up TSS and I/O bitmap. 
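 *
 * Each CPU gets its own TSS with dedicated IST stacks: tss_ist[0] for the
 * DDB trap, tss_ist[1] for double faults, tss_ist[2] for NMIs (with a
 * struct nmistore at the top holding the kernel %cr3), and tss_ist[3] for
 * debug (#DB) traps.  tss_iobase is set past the end of the TSS, so there
 * is no I/O permission bitmap; user I/O port access is denied unless IOPL
 * allows it.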
500 */ 501 void 502 cpu_init_tss(struct cpu_info *ci) 503 { 504 #ifdef __HAVE_PCPU_AREA 505 const cpuid_t cid = cpu_index(ci); 506 #endif 507 struct cpu_tss *cputss; 508 struct nmistore *store; 509 uintptr_t p; 510 511 #ifdef __HAVE_PCPU_AREA 512 cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss; 513 #else 514 cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map, 515 sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 516 #endif 517 518 cputss->tss.tss_iobase = IOMAP_INVALOFF << 16; 519 520 /* DDB stack */ 521 #ifdef __HAVE_PCPU_AREA 522 p = (vaddr_t)&pcpuarea->ent[cid].ist0; 523 #else 524 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 525 #endif 526 cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16; 527 528 /* double fault */ 529 #ifdef __HAVE_PCPU_AREA 530 p = (vaddr_t)&pcpuarea->ent[cid].ist1; 531 #else 532 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 533 #endif 534 cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16; 535 536 /* NMI - store a structure at the top of the stack */ 537 #ifdef __HAVE_PCPU_AREA 538 p = (vaddr_t)&pcpuarea->ent[cid].ist2; 539 #else 540 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 541 #endif 542 cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore); 543 store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore)); 544 store->cr3 = pmap_pdirpa(pmap_kernel(), 0); 545 546 /* DB */ 547 #ifdef __HAVE_PCPU_AREA 548 p = (vaddr_t)&pcpuarea->ent[cid].ist3; 549 #else 550 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 551 #endif 552 cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16; 553 554 ci->ci_tss = cputss; 555 ci->ci_tss_sel = tss_alloc(&cputss->tss); 556 } 557 558 void 559 buildcontext(struct lwp *l, void *catcher, void *f) 560 { 561 struct trapframe *tf = l->l_md.md_regs; 562 563 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); 564 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); 565 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); 566 tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); 567 568 tf->tf_rip = (uint64_t)catcher; 569 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 570 tf->tf_rflags &= ~PSL_CLEARSIG; 571 tf->tf_rsp = (uint64_t)f; 572 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); 573 574 /* Ensure FP state is sane */ 575 fpu_sigreset(l); 576 } 577 578 void 579 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask) 580 { 581 582 printf("sendsig_sigcontext: illegal\n"); 583 sigexit(curlwp, SIGILL); 584 } 585 586 void 587 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) 588 { 589 struct lwp *l = curlwp; 590 struct proc *p = l->l_proc; 591 struct sigacts *ps = p->p_sigacts; 592 int onstack, error; 593 int sig = ksi->ksi_signo; 594 struct sigframe_siginfo *fp, frame; 595 sig_t catcher = SIGACTION(p, sig).sa_handler; 596 struct trapframe *tf = l->l_md.md_regs; 597 char *sp; 598 599 KASSERT(mutex_owned(p->p_lock)); 600 601 /* Do we need to jump onto the signal stack? */ 602 onstack = 603 (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 && 604 (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0; 605 606 /* Allocate space for the signal handler context. */ 607 if (onstack) 608 sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size); 609 else 610 /* AMD64 ABI 128-bytes "red zone". */ 611 sp = (char *)tf->tf_rsp - 128; 612 613 sp -= sizeof(struct sigframe_siginfo); 614 /* Round down the stackpointer to a multiple of 16 for the ABI. 
*/ 615 fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8); 616 617 memset(&frame, 0, sizeof(frame)); 618 frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp; 619 frame.sf_si._info = ksi->ksi_info; 620 frame.sf_uc.uc_flags = _UC_SIGMASK; 621 frame.sf_uc.uc_sigmask = *mask; 622 frame.sf_uc.uc_link = l->l_ctxlink; 623 frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK) 624 ? _UC_SETSTACK : _UC_CLRSTACK; 625 sendsig_reset(l, sig); 626 627 mutex_exit(p->p_lock); 628 cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); 629 /* Copyout all the fp regs, the signal handler might expect them. */ 630 error = copyout(&frame, fp, sizeof frame); 631 mutex_enter(p->p_lock); 632 633 if (error != 0) { 634 /* 635 * Process has trashed its stack; give it an illegal 636 * instruction to halt it in its tracks. 637 */ 638 sigexit(l, SIGILL); 639 /* NOTREACHED */ 640 } 641 642 buildcontext(l, catcher, fp); 643 644 tf->tf_rdi = sig; 645 tf->tf_rsi = (uint64_t)&fp->sf_si; 646 tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc; 647 648 /* Remember that we're now on the signal stack. */ 649 if (onstack) 650 l->l_sigstk.ss_flags |= SS_ONSTACK; 651 652 if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) { 653 /* 654 * process has given an invalid address for the 655 * handler. Stop it, but do not do it before so 656 * we can return the right info to userland (or in core dump) 657 */ 658 sigexit(l, SIGILL); 659 /* NOTREACHED */ 660 } 661 } 662 663 struct pcb dumppcb; 664 665 void 666 cpu_reboot(int howto, char *bootstr) 667 { 668 static bool syncdone = false; 669 int s = IPL_NONE; 670 __USE(s); /* ugly otherwise */ 671 672 if (cold) { 673 howto |= RB_HALT; 674 goto haltsys; 675 } 676 677 boothowto = howto; 678 679 /* i386 maybe_dump() */ 680 681 /* 682 * If we've panic'd, don't make the situation potentially 683 * worse by syncing or unmounting the file systems. 684 */ 685 if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) { 686 if (!syncdone) { 687 syncdone = true; 688 /* XXX used to force unmount as well, here */ 689 vfs_sync_all(curlwp); 690 } 691 692 while (vfs_unmountall1(curlwp, false, false) || 693 config_detach_all(boothowto) || 694 vfs_unmount_forceone(curlwp)) 695 ; /* do nothing */ 696 } else { 697 if (!db_active) 698 suspendsched(); 699 } 700 701 pmf_system_shutdown(boothowto); 702 703 /* Disable interrupts. */ 704 s = splhigh(); 705 706 /* Do a dump if requested. */ 707 if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP) 708 dumpsys(); 709 710 haltsys: 711 doshutdownhooks(); 712 713 if ((howto & RB_POWERDOWN) == RB_POWERDOWN) { 714 #if NACPICA > 0 715 if (s != IPL_NONE) 716 splx(s); 717 718 acpi_enter_sleep_state(ACPI_STATE_S5); 719 #endif 720 #ifdef XEN 721 if (vm_guest == VM_GUEST_XENPV || 722 vm_guest == VM_GUEST_XENPVH || 723 vm_guest == VM_GUEST_XENPVHVM) 724 HYPERVISOR_shutdown(); 725 #endif /* XEN */ 726 } 727 728 cpu_broadcast_halt(); 729 730 if (howto & RB_HALT) { 731 #if NACPICA > 0 732 acpi_disable(); 733 #endif 734 735 printf("\n"); 736 printf("The operating system has halted.\n"); 737 printf("Please press any key to reboot.\n\n"); 738 cnpollc(1); /* for proper keyboard command handling */ 739 if (cngetc() == 0) { 740 /* no console attached, so just hlt */ 741 printf("No keyboard - cannot reboot after all.\n"); 742 for(;;) { 743 x86_hlt(); 744 } 745 } 746 cnpollc(0); 747 } 748 749 printf("rebooting...\n"); 750 if (cpureset_delay > 0) 751 delay(cpureset_delay * 1000); 752 cpu_reset(); 753 for(;;) ; 754 /*NOTREACHED*/ 755 } 756 757 /* 758 * XXXfvdl share dumpcode. 
759 */ 760 761 /* 762 * Perform assorted dump-related initialization tasks. Assumes that 763 * the maximum physical memory address will not increase afterwards. 764 */ 765 void 766 dump_misc_init(void) 767 { 768 #ifndef NO_SPARSE_DUMP 769 int i; 770 #endif 771 772 if (dump_headerbuf != NULL) 773 return; /* already called */ 774 775 #ifndef NO_SPARSE_DUMP 776 for (i = 0; i < mem_cluster_cnt; ++i) { 777 paddr_t top = mem_clusters[i].start + mem_clusters[i].size; 778 if (max_paddr < top) 779 max_paddr = top; 780 } 781 #ifdef DEBUG 782 printf("dump_misc_init: max_paddr = 0x%lx\n", 783 (unsigned long)max_paddr); 784 #endif 785 if (max_paddr == 0) { 786 printf("Your machine does not initialize mem_clusters; " 787 "sparse_dumps disabled\n"); 788 sparse_dump = 0; 789 } else { 790 sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map, 791 roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE), 792 PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); 793 } 794 #endif 795 dump_headerbuf = (void *)uvm_km_alloc(kernel_map, 796 dump_headerbuf_size, 797 PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); 798 /* XXXjld should check for failure here, disable dumps if so. */ 799 } 800 801 #ifndef NO_SPARSE_DUMP 802 /* 803 * Clear the set of pages to include in a sparse dump. 804 */ 805 void 806 sparse_dump_reset(void) 807 { 808 memset(sparse_dump_physmap, 0, 809 roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE)); 810 } 811 812 /* 813 * Include or exclude pages in a sparse dump. 814 */ 815 void 816 sparse_dump_mark(void) 817 { 818 paddr_t p, pstart, pend; 819 struct vm_page *pg; 820 int i; 821 uvm_physseg_t upm; 822 823 /* 824 * Mark all memory pages, then unmark pages that are uninteresting. 825 * Dereferenceing pg->uobject might crash again if another CPU 826 * frees the object out from under us, but we can't lock anything 827 * so it's a risk we have to take. 828 */ 829 830 for (i = 0; i < mem_cluster_cnt; ++i) { 831 pstart = mem_clusters[i].start / PAGE_SIZE; 832 pend = pstart + mem_clusters[i].size / PAGE_SIZE; 833 834 for (p = pstart; p < pend; p++) { 835 setbit(sparse_dump_physmap, p); 836 } 837 } 838 for (upm = uvm_physseg_get_first(); 839 uvm_physseg_valid_p(upm); 840 upm = uvm_physseg_get_next(upm)) { 841 paddr_t pfn; 842 843 /* 844 * We assume that seg->start to seg->end are 845 * uvm_page_physload()ed 846 */ 847 for (pfn = uvm_physseg_get_start(upm); 848 pfn < uvm_physseg_get_end(upm); 849 pfn++) { 850 pg = PHYS_TO_VM_PAGE(ptoa(pfn)); 851 852 if (pg->uanon || (pg->flags & PG_FREE) || 853 (pg->uobject && pg->uobject->pgops)) { 854 p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE; 855 clrbit(sparse_dump_physmap, p); 856 } 857 } 858 } 859 } 860 861 /* 862 * Machine-dependently decides on the contents of a sparse dump, using 863 * the above. 864 */ 865 void 866 cpu_dump_prep_sparse(void) 867 { 868 sparse_dump_reset(); 869 /* XXX could the alternate recursive page table be skipped? */ 870 sparse_dump_mark(); 871 /* Memory for I/O buffers could be unmarked here, for example. */ 872 /* The kernel text could also be unmarked, but gdb would be upset. */ 873 } 874 #endif 875 876 /* 877 * Abstractly iterate over the collection of memory segments to be 878 * dumped; the callback lacks the customary environment-pointer 879 * argument because none of the current users really need one. 880 * 881 * To be used only after dump_seg_prep is called to set things up. 
882 */ 883 int 884 dump_seg_iter(int (*callback)(paddr_t, paddr_t)) 885 { 886 int error, i; 887 888 #define CALLBACK(start,size) do { \ 889 error = callback(start,size); \ 890 if (error) \ 891 return error; \ 892 } while(0) 893 894 for (i = 0; i < mem_cluster_cnt; ++i) { 895 #ifndef NO_SPARSE_DUMP 896 /* 897 * The bitmap is scanned within each memory segment, 898 * rather than over its entire domain, in case any 899 * pages outside of the memory proper have been mapped 900 * into kva; they might be devices that wouldn't 901 * appreciate being arbitrarily read, and including 902 * them could also break the assumption that a sparse 903 * dump will always be smaller than a full one. 904 */ 905 if (sparse_dump && sparse_dump_physmap) { 906 paddr_t p, sp_start, sp_end; 907 int lastset; 908 909 sp_start = mem_clusters[i].start; 910 sp_end = sp_start + mem_clusters[i].size; 911 sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */ 912 lastset = 0; 913 for (p = sp_start; p < sp_end; p += PAGE_SIZE) { 914 int thisset = isset(sparse_dump_physmap, 915 p/PAGE_SIZE); 916 917 if (!lastset && thisset) 918 sp_start = p; 919 if (lastset && !thisset) 920 CALLBACK(sp_start, p - sp_start); 921 lastset = thisset; 922 } 923 if (lastset) 924 CALLBACK(sp_start, p - sp_start); 925 } else 926 #endif 927 CALLBACK(mem_clusters[i].start, mem_clusters[i].size); 928 } 929 return 0; 930 #undef CALLBACK 931 } 932 933 /* 934 * Prepare for an impending core dump: decide what's being dumped and 935 * how much space it will take up. 936 */ 937 void 938 dump_seg_prep(void) 939 { 940 #ifndef NO_SPARSE_DUMP 941 if (sparse_dump && sparse_dump_physmap) 942 cpu_dump_prep_sparse(); 943 #endif 944 945 dump_nmemsegs = 0; 946 dump_npages = 0; 947 dump_seg_iter(dump_seg_count_range); 948 949 dump_header_size = ALIGN(sizeof(kcore_seg_t)) + 950 ALIGN(sizeof(cpu_kcore_hdr_t)) + 951 ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t)); 952 dump_header_size = roundup(dump_header_size, dbtob(1)); 953 954 /* 955 * savecore(8) will read this to decide how many pages to 956 * copy, and cpu_dumpconf has already used the pessimistic 957 * value to set dumplo, so it's time to tell the truth. 958 */ 959 dumpsize = dump_npages; /* XXX could these just be one variable? */ 960 } 961 962 int 963 dump_seg_count_range(paddr_t start, paddr_t size) 964 { 965 ++dump_nmemsegs; 966 dump_npages += size / PAGE_SIZE; 967 return 0; 968 } 969 970 /* 971 * A sparse dump's header may be rather large, due to the number of 972 * "segments" emitted. These routines manage a simple output buffer, 973 * so that the header can be written to disk incrementally. 
974 */ 975 void 976 dump_header_start(void) 977 { 978 dump_headerbuf_ptr = dump_headerbuf; 979 dump_header_blkno = dumplo; 980 } 981 982 int 983 dump_header_flush(void) 984 { 985 const struct bdevsw *bdev; 986 size_t to_write; 987 int error; 988 989 bdev = bdevsw_lookup(dumpdev); 990 to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); 991 error = bdev->d_dump(dumpdev, dump_header_blkno, 992 dump_headerbuf, to_write); 993 dump_header_blkno += btodb(to_write); 994 dump_headerbuf_ptr = dump_headerbuf; 995 return error; 996 } 997 998 int 999 dump_header_addbytes(const void* vptr, size_t n) 1000 { 1001 const char* ptr = vptr; 1002 int error; 1003 1004 while (n > dump_headerbuf_avail) { 1005 memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); 1006 ptr += dump_headerbuf_avail; 1007 n -= dump_headerbuf_avail; 1008 dump_headerbuf_ptr = dump_headerbuf_end; 1009 error = dump_header_flush(); 1010 if (error) 1011 return error; 1012 } 1013 memcpy(dump_headerbuf_ptr, ptr, n); 1014 dump_headerbuf_ptr += n; 1015 1016 return 0; 1017 } 1018 1019 int 1020 dump_header_addseg(paddr_t start, paddr_t size) 1021 { 1022 phys_ram_seg_t seg = { start, size }; 1023 int error; 1024 1025 error = dump_header_addbytes(&seg, sizeof(seg)); 1026 if (error) { 1027 printf("[seg 0x%"PRIxPADDR" bytes 0x%"PRIxPSIZE" failed," 1028 " error=%d] ", start, size, error); 1029 } 1030 return error; 1031 } 1032 1033 int 1034 dump_header_finish(void) 1035 { 1036 int error; 1037 1038 memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); 1039 error = dump_header_flush(); 1040 if (error) 1041 printf("[finish failed, error=%d] ", error); 1042 return error; 1043 } 1044 1045 1046 /* 1047 * These variables are needed by /sbin/savecore 1048 */ 1049 uint32_t dumpmag = 0x8fca0101; /* magic number */ 1050 int dumpsize = 0; /* pages */ 1051 long dumplo = 0; /* blocks */ 1052 1053 /* 1054 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers 1055 * for a full (non-sparse) dump. 1056 */ 1057 int 1058 cpu_dumpsize(void) 1059 { 1060 int size; 1061 1062 size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + 1063 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); 1064 if (roundup(size, dbtob(1)) != dbtob(1)) 1065 return (-1); 1066 1067 return (1); 1068 } 1069 1070 /* 1071 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped 1072 * for a full (non-sparse) dump. 1073 */ 1074 u_long 1075 cpu_dump_mempagecnt(void) 1076 { 1077 u_long i, n; 1078 1079 n = 0; 1080 for (i = 0; i < mem_cluster_cnt; i++) 1081 n += atop(mem_clusters[i].size); 1082 return (n); 1083 } 1084 1085 /* 1086 * cpu_dump: dump the machine-dependent kernel core dump headers. 1087 */ 1088 int 1089 cpu_dump(void) 1090 { 1091 kcore_seg_t seg; 1092 cpu_kcore_hdr_t cpuhdr; 1093 const struct bdevsw *bdev; 1094 int error; 1095 1096 bdev = bdevsw_lookup(dumpdev); 1097 if (bdev == NULL) { 1098 printf("[device 0x%llx ENXIO] ", (unsigned long long)dumpdev); 1099 return ENXIO; 1100 } 1101 1102 /* 1103 * Generate a segment header. 1104 */ 1105 CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); 1106 seg.c_size = dump_header_size - ALIGN(sizeof(seg)); 1107 error = dump_header_addbytes(&seg, ALIGN(sizeof(seg))); 1108 if (error) { 1109 printf("[segment header %zu bytes failed, error=%d] ", 1110 ALIGN(sizeof(seg)), error); 1111 /* blithely proceed (can't fail?) */ 1112 } 1113 1114 /* 1115 * Add the machine-dependent header info. 
1116 */ 1117 cpuhdr.ptdpaddr = PDPpaddr; 1118 cpuhdr.nmemsegs = dump_nmemsegs; 1119 error = dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); 1120 if (error) { 1121 printf("[MD header %zu bytes failed, error=%d] ", 1122 ALIGN(sizeof(cpuhdr)), error); 1123 /* blithely proceed (can't fail?) */ 1124 } 1125 1126 /* 1127 * Write out the memory segment descriptors. 1128 */ 1129 return dump_seg_iter(dump_header_addseg); 1130 } 1131 1132 /* 1133 * Doadump comes here after turning off memory management and 1134 * getting on the dump stack, either when called above, or by 1135 * the auto-restart code. 1136 */ 1137 #define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ 1138 static vaddr_t dumpspace; 1139 1140 vaddr_t 1141 reserve_dumppages(vaddr_t p) 1142 { 1143 1144 dumpspace = p; 1145 return (p + BYTES_PER_DUMP); 1146 } 1147 1148 int 1149 dumpsys_seg(paddr_t maddr, paddr_t bytes) 1150 { 1151 u_long i, m, n; 1152 daddr_t blkno; 1153 const struct bdevsw *bdev; 1154 int (*dump)(dev_t, daddr_t, void *, size_t); 1155 int error; 1156 1157 if (dumpdev == NODEV) 1158 return ENODEV; 1159 bdev = bdevsw_lookup(dumpdev); 1160 if (bdev == NULL || bdev->d_psize == NULL) 1161 return ENODEV; 1162 1163 dump = bdev->d_dump; 1164 1165 blkno = dump_header_blkno; 1166 for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { 1167 /* Print out how many MBs we have left to go. */ 1168 if ((dump_totalbytesleft % (1024*1024)) == 0) 1169 printf_nolog("%lu ", (unsigned long) 1170 (dump_totalbytesleft / (1024 * 1024))); 1171 1172 /* Limit size for next transfer. */ 1173 n = bytes - i; 1174 if (n > BYTES_PER_DUMP) 1175 n = BYTES_PER_DUMP; 1176 1177 for (m = 0; m < n; m += NBPG) 1178 pmap_kenter_pa(dumpspace + m, maddr + m, 1179 VM_PROT_READ, 0); 1180 pmap_update(pmap_kernel()); 1181 1182 error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); 1183 pmap_kremove_local(dumpspace, n); 1184 if (error) 1185 return error; 1186 maddr += n; 1187 blkno += btodb(n); /* XXX? */ 1188 1189 #if 0 /* XXX this doesn't work. grr. */ 1190 /* operator aborting dump? */ 1191 if (sget() != NULL) 1192 return EINTR; 1193 #endif 1194 } 1195 dump_header_blkno = blkno; 1196 1197 return 0; 1198 } 1199 1200 void 1201 dodumpsys(void) 1202 { 1203 const struct bdevsw *bdev; 1204 int dumpend, psize; 1205 int error; 1206 1207 if (dumpdev == NODEV) 1208 return; 1209 1210 bdev = bdevsw_lookup(dumpdev); 1211 if (bdev == NULL || bdev->d_psize == NULL) 1212 return; 1213 /* 1214 * For dumps during autoconfiguration, 1215 * if dump device has already configured... 1216 */ 1217 if (dumpsize == 0) 1218 cpu_dumpconf(); 1219 1220 printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):", 1221 (unsigned long long)major(dumpdev), 1222 (unsigned long long)minor(dumpdev), dumplo, dumpsize); 1223 1224 if (dumplo <= 0 || dumpsize <= 0) { 1225 printf(" not possible\n"); 1226 return; 1227 } 1228 1229 psize = bdev_size(dumpdev); 1230 printf("\ndump "); 1231 if (psize == -1) { 1232 printf("area unavailable\n"); 1233 return; 1234 } 1235 1236 #if 0 /* XXX this doesn't work. grr. 
 */
	/* toss any characters present prior to dump */
	while (sget() != NULL); /*syscons and pccons differ */
#endif

	dump_seg_prep();
	dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
	if (dumpend > psize) {
		printf("failed: insufficient space (%d < %d)\n",
		    psize, dumpend);
		goto failed;
	}

	dump_header_start();
	if ((error = cpu_dump()) != 0)
		goto err;
	if ((error = dump_header_finish()) != 0)
		goto err;

	if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
		printf("BAD header size (%ld [written] != %ld [expected])\n",
		    (long)(dump_header_blkno - dumplo),
		    (long)btodb(dump_header_size));
		goto failed;
	}

	dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
	error = dump_seg_iter(dumpsys_seg);

	if (error == 0 && dump_header_blkno != dumpend) {
		printf("BAD dump size (%ld [written] != %ld [expected])\n",
		    (long)(dumpend - dumplo),
		    (long)(dump_header_blkno - dumplo));
		goto failed;
	}

err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
failed:
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 *
 * Sparse dumps can't be placed as close to the end as possible, because
 * savecore(8) has to know where to start reading in the dump device
 * before it has access to any of the crashed system's state.
 *
 * Note also that a sparse dump will never be larger than a full one:
 * in order to add a phys_ram_seg_t to the header, at least one page
 * must be removed.
 */
void
cpu_dumpconf(void)
{
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV)
		goto bad;
	nblks = bdev_size(dumpdev);
	if (nblks <= ctod(1))
		goto bad;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		goto bad;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();

	dumpblks += ctod(dumpsize);

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1))) {
#ifndef NO_SPARSE_DUMP
		/* A sparse dump might (and hopefully will) fit. */
		dumplo = ctod(1);
#else
		/* But if we're not configured for that, punt. */
		goto bad;
#endif
	} else {
		/* Put dump at end of partition */
		dumplo = nblks - dumpblks;
	}


	/* Now that we've decided this will work, init ancillary stuff.
*/ 1359 dump_misc_init(); 1360 return; 1361 1362 bad: 1363 dumpsize = 0; 1364 } 1365 1366 /* 1367 * Clear registers on exec 1368 */ 1369 void 1370 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) 1371 { 1372 struct pcb *pcb = lwp_getpcb(l); 1373 struct trapframe *tf; 1374 1375 #ifdef USER_LDT 1376 pmap_ldt_cleanup(l); 1377 #endif 1378 1379 fpu_clear(l, pack->ep_osversion >= 699002600 1380 ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__); 1381 x86_dbregs_clear(l); 1382 1383 kpreempt_disable(); 1384 pcb->pcb_flags = 0; 1385 l->l_proc->p_flag &= ~PK_32; 1386 l->l_md.md_flags = MDL_IRET; 1387 cpu_segregs64_zero(l); 1388 kpreempt_enable(); 1389 1390 tf = l->l_md.md_regs; 1391 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); 1392 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); 1393 tf->tf_rdi = 0; 1394 tf->tf_rsi = 0; 1395 tf->tf_rbp = 0; 1396 tf->tf_rbx = l->l_proc->p_psstrp; 1397 tf->tf_rdx = 0; 1398 tf->tf_rcx = 0; 1399 tf->tf_rax = 0; 1400 tf->tf_rip = pack->ep_entry; 1401 tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); 1402 tf->tf_rflags = PSL_USERSET; 1403 tf->tf_rsp = stack; 1404 tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); 1405 } 1406 1407 /* 1408 * Initialize segments and descriptor tables 1409 */ 1410 char *ldtstore; 1411 char *gdtstore; 1412 1413 void 1414 setgate(struct gate_descriptor *gd, void *func, 1415 int ist, int type, int dpl, int sel) 1416 { 1417 vaddr_t vaddr; 1418 1419 vaddr = ((vaddr_t)gd) & ~PAGE_MASK; 1420 1421 kpreempt_disable(); 1422 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); 1423 1424 gd->gd_looffset = (uint64_t)func & 0xffff; 1425 gd->gd_selector = sel; 1426 gd->gd_ist = ist; 1427 gd->gd_type = type; 1428 gd->gd_dpl = dpl; 1429 gd->gd_p = 1; 1430 gd->gd_hioffset = (uint64_t)func >> 16; 1431 gd->gd_zero = 0; 1432 gd->gd_xx1 = 0; 1433 gd->gd_xx2 = 0; 1434 gd->gd_xx3 = 0; 1435 1436 pmap_changeprot_local(vaddr, VM_PROT_READ); 1437 kpreempt_enable(); 1438 } 1439 1440 void 1441 unsetgate(struct gate_descriptor *gd) 1442 { 1443 vaddr_t vaddr; 1444 1445 vaddr = ((vaddr_t)gd) & ~PAGE_MASK; 1446 1447 kpreempt_disable(); 1448 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); 1449 1450 memset(gd, 0, sizeof (*gd)); 1451 1452 pmap_changeprot_local(vaddr, VM_PROT_READ); 1453 kpreempt_enable(); 1454 } 1455 1456 void 1457 setregion(struct region_descriptor *rd, void *base, uint16_t limit) 1458 { 1459 rd->rd_limit = limit; 1460 rd->rd_base = (uint64_t)base; 1461 } 1462 1463 /* 1464 * Note that the base and limit fields are ignored in long mode. 
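 * They do still apply to 32-bit compatibility-mode segments, which is why
 * the GUCODE32/GUDATA32/GUFS/GUGS descriptors built in init_x86_64() pass
 * real limits with def32=1 and is64=0.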
1465 */ 1466 void 1467 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, 1468 int type, int dpl, int gran, int def32, int is64) 1469 { 1470 sd->sd_lolimit = (unsigned)limit; 1471 sd->sd_lobase = (unsigned long)base; 1472 sd->sd_type = type; 1473 sd->sd_dpl = dpl; 1474 sd->sd_p = 1; 1475 sd->sd_hilimit = (unsigned)limit >> 16; 1476 sd->sd_avl = 0; 1477 sd->sd_long = is64; 1478 sd->sd_def32 = def32; 1479 sd->sd_gran = gran; 1480 sd->sd_hibase = (unsigned long)base >> 24; 1481 } 1482 1483 void 1484 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, 1485 int type, int dpl, int gran) 1486 { 1487 memset(sd, 0, sizeof *sd); 1488 sd->sd_lolimit = (unsigned)limit; 1489 sd->sd_lobase = (uint64_t)base; 1490 sd->sd_type = type; 1491 sd->sd_dpl = dpl; 1492 sd->sd_p = 1; 1493 sd->sd_hilimit = (unsigned)limit >> 16; 1494 sd->sd_gran = gran; 1495 sd->sd_hibase = (uint64_t)base >> 24; 1496 } 1497 1498 void 1499 cpu_init_idt(struct cpu_info *ci) 1500 { 1501 struct region_descriptor region; 1502 idt_descriptor_t *idt; 1503 1504 idt = ci->ci_idtvec.iv_idt; 1505 setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); 1506 lidt(®ion); 1507 } 1508 1509 #define IDTVEC(name) __CONCAT(X, name) 1510 typedef void (vector)(void); 1511 extern vector IDTVEC(syscall); 1512 extern vector IDTVEC(syscall32); 1513 extern vector IDTVEC(osyscall); 1514 extern vector *x86_exceptions[]; 1515 1516 #ifndef XENPV 1517 static void 1518 init_x86_64_ksyms(void) 1519 { 1520 #if NKSYMS || defined(DDB) || defined(MODULAR) 1521 extern int end; 1522 extern int *esym; 1523 struct btinfo_symtab *symtab; 1524 vaddr_t tssym, tesym; 1525 1526 #ifdef DDB 1527 db_machine_init(); 1528 #endif 1529 1530 symtab = lookup_bootinfo(BTINFO_SYMTAB); 1531 if (symtab) { 1532 #ifdef KASLR 1533 tssym = bootspace.head.va; 1534 tesym = bootspace.head.va; /* (unused...) 
*/ 1535 #else 1536 tssym = (vaddr_t)symtab->ssym + KERNBASE; 1537 tesym = (vaddr_t)symtab->esym + KERNBASE; 1538 #endif 1539 ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym); 1540 } else { 1541 uintptr_t endp = (uintptr_t)(void *)&end; 1542 1543 if (vm_guest == VM_GUEST_GENPVH) 1544 ksyms_addsyms_elf(0, ((long *)endp) + 1, esym); 1545 else 1546 ksyms_addsyms_elf(*(long *)endp, ((long *)endp) + 1, esym); 1547 } 1548 #endif 1549 } 1550 #endif /* XENPV */ 1551 1552 void __noasan 1553 init_bootspace(void) 1554 { 1555 extern char __rodata_start; 1556 extern char __data_start; 1557 extern char __kernel_end; 1558 size_t i = 0; 1559 1560 memset(&bootspace, 0, sizeof(bootspace)); 1561 1562 bootspace.head.va = KERNTEXTOFF; 1563 bootspace.head.pa = KERNTEXTOFF - KERNBASE; 1564 bootspace.head.sz = 0; 1565 1566 bootspace.segs[i].type = BTSEG_TEXT; 1567 bootspace.segs[i].va = KERNTEXTOFF; 1568 bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE; 1569 bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF; 1570 i++; 1571 1572 bootspace.segs[i].type = BTSEG_RODATA; 1573 bootspace.segs[i].va = (vaddr_t)&__rodata_start; 1574 bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE; 1575 bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start; 1576 i++; 1577 1578 bootspace.segs[i].type = BTSEG_DATA; 1579 bootspace.segs[i].va = (vaddr_t)&__data_start; 1580 bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE; 1581 bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start; 1582 i++; 1583 1584 bootspace.boot.va = (vaddr_t)&__kernel_end; 1585 bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE; 1586 bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) - 1587 (size_t)&__kernel_end; 1588 1589 /* In locore.S, we allocated a tmp va. We will use it now. */ 1590 bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; 1591 1592 /* Virtual address of the L4 page. */ 1593 bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE); 1594 1595 /* Kernel module map. */ 1596 bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE; 1597 bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; 1598 } 1599 1600 static void 1601 init_pte(void) 1602 { 1603 #ifndef XENPV 1604 extern uint32_t nox_flag; 1605 pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir; 1606 pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) | 1607 PTE_P; 1608 #endif 1609 1610 extern pd_entry_t *normal_pdes[3]; 1611 normal_pdes[0] = L2_BASE; 1612 normal_pdes[1] = L3_BASE; 1613 normal_pdes[2] = L4_BASE; 1614 } 1615 1616 void 1617 init_slotspace(void) 1618 { 1619 /* 1620 * XXX Too early to use cprng(9), or even entropy_extract. 1621 */ 1622 struct entpool pool; 1623 size_t randhole; 1624 vaddr_t randva; 1625 uint64_t sample; 1626 vaddr_t va; 1627 1628 memset(&pool, 0, sizeof pool); 1629 cpu_rng_early_sample(&sample); 1630 entpool_enter(&pool, &sample, sizeof sample); 1631 1632 memset(&slotspace, 0, sizeof(slotspace)); 1633 1634 /* User. [256, because we want to land in >= 256] */ 1635 slotspace.area[SLAREA_USER].sslot = 0; 1636 slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1; 1637 slotspace.area[SLAREA_USER].active = true; 1638 1639 #ifdef XENPV 1640 /* PTE. */ 1641 slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE; 1642 slotspace.area[SLAREA_PTE].nslot = 1; 1643 slotspace.area[SLAREA_PTE].active = true; 1644 #endif 1645 1646 #ifdef __HAVE_PCPU_AREA 1647 /* Per-CPU. 
*/ 1648 slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU; 1649 slotspace.area[SLAREA_PCPU].nslot = 1; 1650 slotspace.area[SLAREA_PCPU].active = true; 1651 #endif 1652 1653 #ifdef __HAVE_DIRECT_MAP 1654 /* Direct Map. [Randomized later] */ 1655 slotspace.area[SLAREA_DMAP].active = false; 1656 #endif 1657 1658 #ifdef XENPV 1659 /* Hypervisor. */ 1660 slotspace.area[SLAREA_HYPV].sslot = 256; 1661 slotspace.area[SLAREA_HYPV].nslot = 17; 1662 slotspace.area[SLAREA_HYPV].active = true; 1663 #endif 1664 1665 #ifdef KASAN 1666 /* ASAN. */ 1667 slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN; 1668 slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN; 1669 slotspace.area[SLAREA_ASAN].active = true; 1670 #endif 1671 1672 #ifdef KMSAN 1673 /* MSAN. */ 1674 slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN; 1675 slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN; 1676 slotspace.area[SLAREA_MSAN].active = true; 1677 #endif 1678 1679 /* Kernel. */ 1680 slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE; 1681 slotspace.area[SLAREA_KERN].nslot = 1; 1682 slotspace.area[SLAREA_KERN].active = true; 1683 1684 /* Main. */ 1685 cpu_rng_early_sample(&sample); 1686 entpool_enter(&pool, &sample, sizeof sample); 1687 entpool_extract(&pool, &randhole, sizeof randhole); 1688 entpool_extract(&pool, &randva, sizeof randva); 1689 va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4, 1690 NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */ 1691 vm_min_kernel_address = va; 1692 vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4; 1693 1694 #ifndef XENPV 1695 /* PTE. */ 1696 cpu_rng_early_sample(&sample); 1697 entpool_enter(&pool, &sample, sizeof sample); 1698 entpool_extract(&pool, &randhole, sizeof randhole); 1699 entpool_extract(&pool, &randva, sizeof randva); 1700 va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva); 1701 pte_base = (pd_entry_t *)va; 1702 #endif 1703 1704 explicit_memset(&pool, 0, sizeof pool); 1705 } 1706 1707 void 1708 init_x86_64(paddr_t first_avail) 1709 { 1710 extern void consinit(void); 1711 struct region_descriptor region; 1712 struct mem_segment_descriptor *ldt_segp; 1713 struct idt_vec *iv; 1714 idt_descriptor_t *idt; 1715 int x; 1716 struct pcb *pcb; 1717 extern vaddr_t lwp0uarea; 1718 #ifndef XENPV 1719 extern paddr_t local_apic_pa; 1720 #endif 1721 1722 KASSERT(first_avail % PAGE_SIZE == 0); 1723 1724 #ifdef XENPV 1725 KASSERT(HYPERVISOR_shared_info != NULL); 1726 cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; 1727 #endif 1728 1729 #ifdef XEN 1730 if (vm_guest == VM_GUEST_XENPVH || vm_guest == VM_GUEST_GENPVH) 1731 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); 1732 #endif 1733 init_pte(); 1734 1735 uvm_lwp_setuarea(&lwp0, lwp0uarea); 1736 1737 cpu_probe(&cpu_info_primary); 1738 #ifdef SVS 1739 svs_init(); 1740 #endif 1741 1742 /* 1743 * Initialize MSRs on cpu0: 1744 * 1745 * - Enables SYSCALL/SYSRET. 1746 * 1747 * - Sets up %fs and %gs so that %gs points to the current 1748 * struct cpu_info as needed for CPUVAR(...), curcpu(), and 1749 * curlwp. 1750 * 1751 * - Enables the no-execute bit if supported. 1752 * 1753 * Thus, after this point, CPUVAR(...), curcpu(), and curlwp 1754 * will work on cpu0. 1755 * 1756 * Note: The call to cpu_init_msrs for secondary CPUs happens 1757 * in cpu_hatch. 
1758 */ 1759 cpu_init_msrs(&cpu_info_primary, true); 1760 1761 #ifndef XENPV 1762 cpu_speculation_init(&cpu_info_primary); 1763 #endif 1764 1765 use_pae = 1; /* PAE always enabled in long mode */ 1766 1767 pcb = lwp_getpcb(&lwp0); 1768 #ifdef XENPV 1769 mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM); 1770 pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE; 1771 #else 1772 pcb->pcb_cr3 = PDPpaddr; 1773 #endif 1774 1775 #if NISA > 0 || NPCI > 0 1776 x86_bus_space_init(); 1777 #endif 1778 1779 pat_init(&cpu_info_primary); 1780 1781 consinit(); /* XXX SHOULD NOT BE DONE HERE */ 1782 1783 /* 1784 * Initialize PAGE_SIZE-dependent variables. 1785 */ 1786 uvm_md_init(); 1787 1788 uvmexp.ncolors = 2; 1789 1790 avail_start = first_avail; 1791 1792 #ifndef XENPV 1793 /* 1794 * Low memory reservations: 1795 * Page 0: BIOS data 1796 * Page 1: BIOS callback (not used yet, for symmetry with i386) 1797 * Page 2: MP bootstrap code (MP_TRAMPOLINE) 1798 * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR) 1799 * Page 4: Temporary page table for 0MB-4MB 1800 * Page 5: Temporary page directory 1801 * Page 6: Temporary page map level 3 1802 * Page 7: Temporary page map level 4 1803 */ 1804 lowmem_rsvd = 8 * PAGE_SIZE; 1805 1806 /* Initialize the memory clusters (needed in pmap_bootstrap). */ 1807 init_x86_clusters(); 1808 #else 1809 /* Parse Xen command line (replace bootinfo) */ 1810 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); 1811 1812 avail_end = ctob(xen_start_info.nr_pages); 1813 pmap_pa_start = (KERNTEXTOFF - KERNBASE); 1814 pmap_pa_end = avail_end; 1815 #endif 1816 1817 /* 1818 * Call pmap initialization to make new kernel address space. 1819 * We must do this before loading pages into the VM system. 1820 */ 1821 pmap_bootstrap(VM_MIN_KERNEL_ADDRESS); 1822 1823 /* 1824 * Initialize RNG to get entropy ASAP either from CPU 1825 * RDRAND/RDSEED or from seed on disk. Constraints: 1826 * 1827 * - Must happen after cpu_init_msrs so that curcpu() and 1828 * curlwp work. 1829 * 1830 * - Must happen after consinit so we have the opportunity to 1831 * print useful feedback. 1832 * 1833 * - On KASLR kernels, must happen after pmap_bootstrap because 1834 * x86_rndseed requires access to the direct map. 1835 */ 1836 cpu_rng_init(); 1837 x86_rndseed(); 1838 1839 #ifndef XENPV 1840 /* Internalize the physical pages into the VM system. 
*/ 1841 init_x86_vm(avail_start); 1842 #else 1843 physmem = xen_start_info.nr_pages; 1844 uvm_page_physload(atop(avail_start), atop(avail_end), 1845 atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT); 1846 #endif 1847 1848 init_x86_msgbuf(); 1849 1850 kasan_init(); 1851 kcsan_init(); 1852 kmsan_init((void *)lwp0uarea); 1853 1854 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); 1855 1856 kpreempt_disable(); 1857 1858 #ifndef XENPV 1859 pmap_kenter_pa(local_apic_va, local_apic_pa, 1860 VM_PROT_READ|VM_PROT_WRITE, 0); 1861 pmap_update(pmap_kernel()); 1862 memset((void *)local_apic_va, 0, PAGE_SIZE); 1863 #endif 1864 1865 pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1866 pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1867 pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1868 pmap_update(pmap_kernel()); 1869 memset((void *)idt_vaddr, 0, PAGE_SIZE); 1870 memset((void *)gdt_vaddr, 0, PAGE_SIZE); 1871 memset((void *)ldt_vaddr, 0, PAGE_SIZE); 1872 1873 #ifndef XENPV 1874 pmap_changeprot_local(idt_vaddr, VM_PROT_READ); 1875 #endif 1876 1877 pmap_update(pmap_kernel()); 1878 1879 iv = &(cpu_info_primary.ci_idtvec); 1880 idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary)); 1881 idt = iv->iv_idt; 1882 gdtstore = (char *)gdt_vaddr; 1883 ldtstore = (char *)ldt_vaddr; 1884 1885 /* 1886 * Make GDT gates and memory segments. 1887 */ 1888 set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, 1889 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); 1890 1891 set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, 1892 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); 1893 1894 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0, 1895 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); 1896 1897 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0, 1898 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); 1899 1900 #ifndef XENPV 1901 set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore, 1902 LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0); 1903 #endif 1904 1905 /* 1906 * Make LDT memory segments. 1907 */ 1908 *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) = 1909 *GDT_ADDR_MEM(gdtstore, GUCODE_SEL); 1910 *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) = 1911 *GDT_ADDR_MEM(gdtstore, GUDATA_SEL); 1912 1913 /* 1914 * 32 bit GDT entries. 1915 */ 1916 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0, 1917 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); 1918 1919 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0, 1920 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1921 1922 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0, 1923 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1924 1925 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0, 1926 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1927 1928 /* 1929 * 32 bit LDT entries. 1930 */ 1931 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL); 1932 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, 1933 SDT_MEMERA, SEL_UPL, 1, 1, 0); 1934 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL); 1935 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, 1936 SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1937 1938 /* CPU-specific IDT exceptions. */ 1939 for (x = 0; x < NCPUIDT; x++) { 1940 int sel, ist; 1941 1942 /* Reset to default. 
Special cases below */ 1943 sel = SEL_KPL; 1944 ist = 0; 1945 1946 idt_vec_reserve(iv, x); 1947 1948 switch (x) { 1949 case 1: /* DB */ 1950 ist = 4; 1951 break; 1952 case 2: /* NMI */ 1953 ist = 3; 1954 break; 1955 case 3: 1956 case 4: 1957 sel = SEL_UPL; 1958 break; 1959 case 8: /* double fault */ 1960 ist = 2; 1961 break; 1962 #ifdef XENPV 1963 case 18: /* MCA */ 1964 sel |= 0x4; /* Auto EOI/mask */ 1965 break; 1966 #endif /* XENPV */ 1967 default: 1968 break; 1969 } 1970 1971 set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT, 1972 sel, GSEL(GCODE_SEL, SEL_KPL)); 1973 } 1974 1975 /* new-style interrupt gate for syscalls */ 1976 idt_vec_reserve(iv, 128); 1977 set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL, 1978 GSEL(GCODE_SEL, SEL_KPL)); 1979 1980 kpreempt_enable(); 1981 1982 setregion(®ion, gdtstore, DYNSEL_START - 1); 1983 lgdt(®ion); 1984 1985 #ifdef XENPV 1986 /* Init Xen callbacks and syscall handlers */ 1987 if (HYPERVISOR_set_callbacks( 1988 (unsigned long) hypervisor_callback, 1989 (unsigned long) failsafe_callback, 1990 (unsigned long) Xsyscall)) 1991 panic("HYPERVISOR_set_callbacks() failed"); 1992 #endif /* XENPV */ 1993 1994 cpu_init_idt(&cpu_info_primary); 1995 1996 #ifdef XENPV 1997 xen_init_ksyms(); 1998 #else /* XENPV */ 1999 #ifdef XEN 2000 if (vm_guest == VM_GUEST_XENPVH) 2001 xen_init_ksyms(); 2002 else 2003 #endif /* XEN */ 2004 init_x86_64_ksyms(); 2005 #endif /* XENPV */ 2006 2007 #ifndef XENPV 2008 intr_default_setup(); 2009 #else 2010 events_default_setup(); 2011 #endif 2012 2013 splraise(IPL_HIGH); 2014 x86_enable_intr(); 2015 2016 #ifdef DDB 2017 if (boothowto & RB_KDB) 2018 Debugger(); 2019 #endif 2020 #ifdef KGDB 2021 kgdb_port_init(); 2022 if (boothowto & RB_KDB) { 2023 kgdb_debug_init = 1; 2024 kgdb_connect(1); 2025 } 2026 #endif 2027 2028 pcb->pcb_dbregs = NULL; 2029 x86_dbregs_init(); 2030 } 2031 2032 void 2033 cpu_reset(void) 2034 { 2035 #ifndef XENPV 2036 idt_descriptor_t *idt; 2037 vaddr_t vaddr; 2038 2039 idt = cpu_info_primary.ci_idtvec.iv_idt; 2040 vaddr = (vaddr_t)idt; 2041 #endif 2042 2043 x86_disable_intr(); 2044 2045 #ifdef XENPV 2046 HYPERVISOR_reboot(); 2047 #else 2048 2049 x86_reset(); 2050 2051 /* 2052 * Try to cause a triple fault and watchdog reset by making the IDT 2053 * invalid and causing a fault. 2054 */ 2055 kpreempt_disable(); 2056 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); 2057 memset((void *)idt, 0, NIDT * sizeof(idt[0])); 2058 kpreempt_enable(); 2059 breakpoint(); 2060 2061 #if 0 2062 /* 2063 * Try to cause a triple fault and watchdog reset by unmapping the 2064 * entire address space and doing a TLB flush. 
	memset((void *)PTD, 0, PAGE_SIZE);
	tlbflush();
#endif
#endif /* XENPV */

	for (;;);
}

void
cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
{
	const struct trapframe *tf = l->l_md.md_regs;
	__greg_t ras_rip;

	mcp->__gregs[_REG_RDI] = tf->tf_rdi;
	mcp->__gregs[_REG_RSI] = tf->tf_rsi;
	mcp->__gregs[_REG_RDX] = tf->tf_rdx;
	mcp->__gregs[_REG_R10] = tf->tf_r10;
	mcp->__gregs[_REG_R8] = tf->tf_r8;
	mcp->__gregs[_REG_R9] = tf->tf_r9;
	/* argX not touched */
	mcp->__gregs[_REG_RCX] = tf->tf_rcx;
	mcp->__gregs[_REG_R11] = tf->tf_r11;
	mcp->__gregs[_REG_R12] = tf->tf_r12;
	mcp->__gregs[_REG_R13] = tf->tf_r13;
	mcp->__gregs[_REG_R14] = tf->tf_r14;
	mcp->__gregs[_REG_R15] = tf->tf_r15;
	mcp->__gregs[_REG_RBP] = tf->tf_rbp;
	mcp->__gregs[_REG_RBX] = tf->tf_rbx;
	mcp->__gregs[_REG_RAX] = tf->tf_rax;
	mcp->__gregs[_REG_GS] = 0;
	mcp->__gregs[_REG_FS] = 0;
	mcp->__gregs[_REG_ES] = GSEL(GUDATA_SEL, SEL_UPL);
	mcp->__gregs[_REG_DS] = GSEL(GUDATA_SEL, SEL_UPL);
	mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno;
	mcp->__gregs[_REG_ERR] = tf->tf_err;
	mcp->__gregs[_REG_RIP] = tf->tf_rip;
	mcp->__gregs[_REG_CS] = LSEL(LUCODE_SEL, SEL_UPL);
	mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags;
	mcp->__gregs[_REG_RSP] = tf->tf_rsp;
	mcp->__gregs[_REG_SS] = LSEL(LUDATA_SEL, SEL_UPL);

	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
	    (void *) mcp->__gregs[_REG_RIP])) != -1)
		mcp->__gregs[_REG_RIP] = ras_rip;

	*flags |= _UC_CPU;

	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
	*flags |= _UC_TLSBASE;

	process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
	*flags |= _UC_FPU;
}

int
cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
{
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr = mcp->__gregs;
	struct proc *p = l->l_proc;
	int error;
	int64_t rflags;

	/* 26 general registers + TLS base + 512-byte fxsave area. */
	CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);

	if ((flags & _UC_CPU) != 0) {
		error = cpu_mcontext_validate(l, mcp);
		if (error != 0)
			return error;

		tf->tf_rdi = gr[_REG_RDI];
		tf->tf_rsi = gr[_REG_RSI];
		tf->tf_rdx = gr[_REG_RDX];
		tf->tf_r10 = gr[_REG_R10];
		tf->tf_r8 = gr[_REG_R8];
		tf->tf_r9 = gr[_REG_R9];
		/* argX not touched */
		tf->tf_rcx = gr[_REG_RCX];
		tf->tf_r11 = gr[_REG_R11];
		tf->tf_r12 = gr[_REG_R12];
		tf->tf_r13 = gr[_REG_R13];
		tf->tf_r14 = gr[_REG_R14];
		tf->tf_r15 = gr[_REG_R15];
		tf->tf_rbp = gr[_REG_RBP];
		tf->tf_rbx = gr[_REG_RBX];
		tf->tf_rax = gr[_REG_RAX];
		tf->tf_gs = 0;
		tf->tf_fs = 0;
		tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
		tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
		/* trapno, err not touched */
		tf->tf_rip = gr[_REG_RIP];
		tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
		rflags = tf->tf_rflags;
		rflags &= ~PSL_USER;
		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
		tf->tf_rsp = gr[_REG_RSP];
		tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);

		/* Force the return to user space to take the iret path. */
		l->l_md.md_flags |= MDL_IRET;
	}

	if ((flags & _UC_FPU) != 0)
		process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);

	if ((flags & _UC_TLSBASE) != 0)
		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);

	mutex_enter(p->p_lock);
	if (flags & _UC_SETSTACK)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
	if (flags & _UC_CLRSTACK)
		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
	mutex_exit(p->p_lock);

	return 0;
}

int
cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
{
	struct proc *p __diagused = l->l_proc;
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr;
	uint16_t sel;

	KASSERT((p->p_flag & PK_32) == 0);
	gr = mcp->__gregs;

	/* Reject changes to rflags bits the user may not modify. */
	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

	sel = gr[_REG_ES] & 0xffff;
	if (sel != 0 && !VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_FS] & 0xffff;
	if (sel != 0 && !VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_GS] & 0xffff;
	if (sel != 0 && !VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_DS] & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

#ifndef XENPV
	sel = gr[_REG_SS] & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_CS] & 0xffff;
	if (!VALID_USER_CSEL(sel))
		return EINVAL;
#endif

	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}

int
mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
{
	const vaddr_t v = (vaddr_t)ptr;
	vaddr_t kva, kva_end;
	size_t i;

	kva = bootspace.head.va;
	kva_end = kva + bootspace.head.sz;
	if (v >= kva && v < kva_end) {
		*handled = true;
		return 0;
	}

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		kva = bootspace.segs[i].va;
		kva_end = kva + bootspace.segs[i].sz;
		if (v < kva || v >= kva_end)
			continue;
		*handled = true;
		if (bootspace.segs[i].type == BTSEG_TEXT ||
		    bootspace.segs[i].type == BTSEG_RODATA) {
			if (prot & VM_PROT_WRITE) {
				return EFAULT;
			}
		}
		return 0;
	}

	kva = bootspace.boot.va;
	kva_end = kva + bootspace.boot.sz;
	if (v >= kva && v < kva_end) {
		*handled = true;
		return 0;
	}

	if (v >= bootspace.smodule && v < bootspace.emodule) {
		*handled = true;
		if (!uvm_map_checkprot(module_map, v, v + 1, prot)) {
			return EFAULT;
		}
	} else {
		*handled = false;
	}
	return 0;
}

/*
 * Zero out a 64bit LWP's segment registers. Used when exec'ing a new
 * 64bit program.
 */
void
cpu_segregs64_zero(struct lwp *l)
{
	struct trapframe * const tf = l->l_md.md_regs;
	struct pcb *pcb;
	uint64_t zero = 0;

	KASSERT(kpreempt_disabled());
	KASSERT((l->l_proc->p_flag & PK_32) == 0);
	KASSERT(l == curlwp);

	pcb = lwp_getpcb(l);

	tf->tf_fs = 0;
	tf->tf_gs = 0;
	setds(GSEL(GUDATA_SEL, SEL_UPL));
	setes(GSEL(GUDATA_SEL, SEL_UPL));
	setfs(0);
	setusergs(0);

#ifndef XENPV
	wrmsr(MSR_FSBASE, 0);
	wrmsr(MSR_KERNELGSBASE, 0);
#else
	HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
	HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
#endif

	pcb->pcb_fs = 0;
	pcb->pcb_gs = 0;
	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
}
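
/*
 * A note on the 32-bit variant below: unlike cpu_segregs64_zero() above it
 * does not touch the FSBASE/KERNELGSBASE MSRs, presumably because a 32-bit
 * process takes its %fs and %gs bases from the GUFS_SEL/GUGS_SEL
 * descriptors, which cpu_fsgs_reload() further below keeps in sync with
 * pcb_fs/pcb_gs.
 */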

/*
 * Zero out a 32bit LWP's segment registers. Used when exec'ing a new
 * 32bit program.
 */
void
cpu_segregs32_zero(struct lwp *l)
{
	struct trapframe * const tf = l->l_md.md_regs;
	struct pcb *pcb;
	uint64_t zero = 0;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_proc->p_flag & PK_32);
	KASSERT(l == curlwp);

	pcb = lwp_getpcb(l);

	tf->tf_fs = 0;
	tf->tf_gs = 0;
	setds(GSEL(GUDATA32_SEL, SEL_UPL));
	setes(GSEL(GUDATA32_SEL, SEL_UPL));
	setfs(0);
	setusergs(0);
	pcb->pcb_fs = 0;
	pcb->pcb_gs = 0;
	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
}

/*
 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
 * Used only for 32-bit processes.
 */
void
cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
{
	struct trapframe *tf;
	struct pcb *pcb;

	KASSERT(l->l_proc->p_flag & PK_32);
	KASSERT(l == curlwp);

	tf = l->l_md.md_regs;
	fssel &= 0xFFFF;
	gssel &= 0xFFFF;

	pcb = lwp_getpcb(l);
	kpreempt_disable();
	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);

#ifdef XENPV
	setusergs(gssel);
#endif

	tf->tf_fs = fssel;
	tf->tf_gs = gssel;
	kpreempt_enable();
}

bool
mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
{
	vaddr_t va = (vaddr_t)addr;

#ifdef __HAVE_DIRECT_MAP
	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
		*paddr = PMAP_DIRECT_UNMAP(va);
		return true;
	}
#else
	__USE(va);
#endif

	return false;
}

bool
mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
{
#ifdef __HAVE_DIRECT_MAP
	*vaddr = PMAP_DIRECT_MAP(paddr);
	return true;
#else
	return false;
#endif
}

static void
idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
{
	idt_descriptor_t *idt_dst;

	idt_dst = dst->iv_idt;

	kpreempt_disable();
	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE);

	memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
	memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));

	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ);
	kpreempt_enable();
}

void
idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
{
	vaddr_t va;

	if (cid != cpu_index(&cpu_info_primary) &&
	    idt_vec_is_pcpu()) {
#ifdef __HAVE_PCPU_AREA
		va = (vaddr_t)&pcpuarea->ent[cid].idt;
#else
		struct vm_page *pg;

		va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		    UVM_KMF_VAONLY);
		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
		if (pg == NULL) {
			panic("failed to allocate a page for IDT");
		}
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ|VM_PROT_WRITE, 0);
		pmap_update(pmap_kernel());
#endif

		memset((void *)va, 0, PAGE_SIZE);
#ifndef XENPV
		pmap_changeprot_local(va, VM_PROT_READ);
#endif
		pmap_update(pmap_kernel());

		iv->iv_idt = (void *)va;
		idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
	} else {
		iv->iv_idt = (void *)idt_vaddr;
	}
}