/*	$OpenBSD: machdep.c,v 1.297 2024/09/21 19:06:07 deraadt Exp $	*/
/*	$NetBSD: machdep.c,v 1.3 2003/05/07 22:58:18 fvdl Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/extent.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/syscallargs.h>

#include <dev/cons.h>
#include <stand/boot/bootarg.h>

#include <net/if.h>
#include <uvm/uvm_extern.h>

#include <sys/sysctl.h>

#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/fpu.h>
#include <machine/biosvar.h>
#include <machine/mpbiosvar.h>
#include <machine/kcore.h>
#include <machine/tss.h>

#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
extern int db_console;
#endif

#include "isa.h"
#include "isadma.h"
#include "ksyms.h"

#include "acpi.h"
#if NACPI > 0
#include <dev/acpi/acpireg.h>
#include <dev/acpi/acpivar.h>
#endif

#include "com.h"
#if NCOM > 0
#include <sys/tty.h>
#include <dev/ic/comvar.h>
#include <dev/ic/comreg.h>
#endif

#include "efi.h"
#if NEFI > 0
#include <dev/efi/efi.h>
#endif

#include "softraid.h"
#if NSOFTRAID > 0
#include <dev/softraidvar.h>
#endif

#ifdef HIBERNATE
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */

#include "ukbd.h"
#include "pckbc.h"
#if NPCKBC > 0 && NUKBD > 0
#include <dev/ic/pckbcvar.h>
#endif

/* #define MACHDEP_DEBUG */

#ifdef MACHDEP_DEBUG
#define DPRINTF(x...)	do { printf(x); } while(0)
#else
#define DPRINTF(x...)
#endif /* MACHDEP_DEBUG */

/* the following is used externally (sysctl_hw) */
char machine[] = MACHINE;

/*
 * switchto vectors
 */
void cpu_idle_cycle_hlt(void);
void (*cpu_idle_cycle_fcn)(void) = &cpu_idle_cycle_hlt;
void (*cpu_suspend_cycle_fcn)(void);

/* the following is used externally for concurrent handlers */
int setperf_prio = 0;

#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 0;
#endif

char *ssym = 0, *esym = 0;	/* start and end of symbol table */
dev_t bootdev = 0;		/* device we booted from */
int biosbasemem = 0;		/* base memory reported by BIOS */
u_int bootapiver = 0;		/* /boot API version */

int physmem;
extern int boothowto;

paddr_t dumpmem_paddr;
vaddr_t dumpmem_vaddr;
psize_t dumpmem_sz;

vaddr_t kern_end;

vaddr_t msgbuf_vaddr;
paddr_t msgbuf_paddr;

vaddr_t idt_vaddr;
paddr_t idt_paddr;

vaddr_t lo32_vaddr;
paddr_t lo32_paddr;
paddr_t tramp_pdirpa;

int kbd_reset;
int lid_action = 1;
int pwr_action = 1;
int forceukbd;

/*
 * safepri is a safe priority for sleep to set for a spin-wait
 * during autoconfiguration or after a panic.
 */
int safepri = 0;

struct vm_map *exec_map = NULL;
struct vm_map *phys_map = NULL;
/* UVM constraint ranges. */
struct uvm_constraint_range isa_constraint = { 0x0, 0x00ffffffUL };
struct uvm_constraint_range dma_constraint = { 0x0, 0xffffffffUL };
struct uvm_constraint_range *uvm_md_constraints[] = {
	&isa_constraint,
	&dma_constraint,
	NULL,
};

paddr_t avail_start;
paddr_t avail_end;

void (*delay_func)(int) = i8254_delay;
void (*initclock_func)(void) = i8254_initclocks;
void (*startclock_func)(void) = i8254_start_both_clocks;

/*
 * Format of boot information passed to us by 32-bit /boot
 */
typedef struct _boot_args32 {
	int	ba_type;
	int	ba_size;
	int	ba_nextX;	/* a ptr in 32-bit world, but not here */
	char	ba_arg[1];
} bootarg32_t;

#define BOOTARGC_MAX	NBPG	/* one page */

bios_bootmac_t *bios_bootmac;

/* locore copies the arguments from /boot to here for us */
char bootinfo[BOOTARGC_MAX];
int bootinfo_size = BOOTARGC_MAX;

void getbootinfo(char *, int);

/* Data passed to us by /boot, filled in by getbootinfo() */
bios_diskinfo_t *bios_diskinfo;
bios_memmap_t *bios_memmap;
u_int32_t bios_cksumlen;
bios_efiinfo_t *bios_efiinfo;
bios_ucode_t *bios_ucode;

#if NEFI > 0
EFI_MEMORY_DESCRIPTOR *mmap;
#endif

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt;

int cpu_dump(void);
int cpu_dumpsize(void);
u_long cpu_dump_mempagecnt(void);
void dumpsys(void);
void cpu_init_extents(void);
void map_tramps(void);
void init_x86_64(paddr_t);
void (*cpuresetfn)(void);
void enter_shared_special_pages(void);

#ifdef APERTURE
int allowaperture = 0;
#endif

/*
 * Machine-dependent startup code
 */
void
cpu_startup(void)
{
	vaddr_t minaddr, maxaddr;

	msgbuf_vaddr = PMAP_DIRECT_MAP(msgbuf_paddr);
	initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));

	printf("%s", version);
	startclocks();
	rtcinit();

	printf("real mem = %lu (%luMB)\n", ptoa((psize_t)physmem),
	    ptoa((psize_t)physmem)/1024/1024);

	/*
	 * Allocate a submap for exec arguments.  This map effectively
	 * limits the number of processes exec'ing at any time.
	 */
	minaddr = vm_map_min(kernel_map);
	exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);

	/*
	 * Allocate a submap for physio
	 */
	minaddr = vm_map_min(kernel_map);
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, FALSE, NULL);

	printf("avail mem = %lu (%luMB)\n", ptoa((psize_t)uvmexp.free),
	    ptoa((psize_t)uvmexp.free)/1024/1024);

	bufinit();

	if (boothowto & RB_CONFIG) {
#ifdef BOOT_CONFIG
		user_config();
#else
		printf("kernel does not support -c; continuing..\n");
#endif
	}

	/* Safe for i/o port / memory space allocation to use malloc now. */
	x86_bus_space_mallocok();

#ifndef SMALL_KERNEL
	cpu_ucode_setup();
	cpu_ucode_apply(&cpu_info_primary);
#endif
	cpu_tsx_disable(&cpu_info_primary);

	/* enter the IDT and trampoline code in the u-k maps */
	enter_shared_special_pages();

	/* initialize CPU0's TSS and GDT and put them in the u-k maps */
	cpu_enter_pages(&cpu_info_full_primary);
}

/*
 * enter_shared_special_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 IDT page
 *  Various number of pages covering the U-K ".kutext" section. This section
 *   contains code needed during trampoline operation
 *  Various number of pages covering the U-K ".kudata" section. This section
 *   contains data accessed by the trampoline, before switching to U+K
 *   (for example, various shared global variables used by IPIs, etc)
 *
 * The linker script places the required symbols in the sections above.
 *
 * On CPUs not affected by Meltdown, the calls to pmap_enter_special below
 * become no-ops.
 */
void
enter_shared_special_pages(void)
{
	extern char __kutext_start[], __kutext_end[], __kernel_kutext_phys[];
	extern char __text_page_start[], __text_page_end[];
	extern char __kernel_kutext_page_phys[];
	extern char __kudata_start[], __kudata_end[], __kernel_kudata_phys[];
	vaddr_t va;
	paddr_t pa;

	/* idt */
	pmap_enter_special(idt_vaddr, idt_paddr, PROT_READ);
	DPRINTF("%s: entered idt page va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)idt_vaddr, (uint64_t)idt_paddr);

	/* .kutext section */
	va = (vaddr_t)__kutext_start;
	pa = (paddr_t)__kernel_kutext_phys;
	while (va < (vaddr_t)__kutext_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kutext.page section */
	va = (vaddr_t)__text_page_start;
	pa = (paddr_t)__kernel_kutext_page_phys;
	while (va < (vaddr_t)__text_page_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_EXEC);
		DPRINTF("%s: entered kutext.page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	/* .kudata section */
	va = (vaddr_t)__kudata_start;
	pa = (paddr_t)__kernel_kudata_phys;
	while (va < (vaddr_t)__kudata_end) {
		pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
		DPRINTF("%s: entered kudata page va 0x%llx pa 0x%llx\n",
		    __func__, (uint64_t)va, (uint64_t)pa);
		va += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
}
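
/*
 * Note: the per-CPU TSS and GDT pages are entered into the same U-K
 * table separately, by cpu_enter_pages() called from cpu_startup()
 * above.
 */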

/*
 * Set up proc0's PCB and the cpu's TSS.
 */
void
x86_64_proc0_tss_ldt_init(void)
{
	struct pcb *pcb;

	cpu_info_primary.ci_curpcb = pcb = &proc0.p_addr->u_pcb;
	pcb->pcb_fsbase = 0;
	pcb->pcb_kstack = (u_int64_t)proc0.p_addr + USPACE - 16;
	proc0.p_md.md_regs = (struct trapframe *)pcb->pcb_kstack - 1;

	ltr(GSYSSEL(GPROC0_SEL, SEL_KPL));
	lldt(0);
}

bios_diskinfo_t *
bios_getdiskinfo(dev_t dev)
{
	bios_diskinfo_t *pdi;

	if (bios_diskinfo == NULL)
		return NULL;

	for (pdi = bios_diskinfo; pdi->bios_number != -1; pdi++) {
		if ((dev & B_MAGICMASK) == B_DEVMAGIC) { /* search by bootdev */
			if (pdi->bsd_dev == dev)
				break;
		} else {
			if (pdi->bios_number == dev)
				break;
		}
	}

	if (pdi->bios_number == -1)
		return NULL;
	else
		return pdi;
}

int
bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	bios_diskinfo_t *pdi;
	int biosdev;

	/* all sysctl names at this level except diskinfo are terminal */
	if (namelen != 1 && name[0] != BIOS_DISKINFO)
		return (ENOTDIR);	/* overloaded */

	if (!(bootapiver & BAPIV_VECTOR))
		return EOPNOTSUPP;

	switch (name[0]) {
	case BIOS_DEV:
		if ((pdi = bios_getdiskinfo(bootdev)) == NULL)
			return ENXIO;
		biosdev = pdi->bios_number;
		return sysctl_rdint(oldp, oldlenp, newp, biosdev);
	case BIOS_DISKINFO:
		if (namelen != 2)
			return ENOTDIR;
		if ((pdi = bios_getdiskinfo(name[1])) == NULL)
			return ENXIO;
		return sysctl_rdstruct(oldp, oldlenp, newp, pdi, sizeof(*pdi));
	case BIOS_CKSUMLEN:
		return sysctl_rdint(oldp, oldlenp, newp, bios_cksumlen);
	default:
		return EOPNOTSUPP;
	}
	/* NOTREACHED */
}

extern int tsc_is_invariant;
extern int amd64_has_xcrypt;
extern int need_retpoline;

const struct sysctl_bounded_args cpuctl_vars[] = {
	{ CPU_LIDACTION, &lid_action, -1, 2 },
	{ CPU_PWRACTION, &pwr_action, 0, 2 },
	{ CPU_CPUID, &cpu_id, SYSCTL_INT_READONLY },
	{ CPU_CPUFEATURE, &cpu_feature, SYSCTL_INT_READONLY },
	{ CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
	{ CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
	{ CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
};
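
/*
 * Illustrative sketch (not part of the kernel): the variables above
 * surface as machdep.* sysctl nodes and can be read from userland
 * roughly like so:
 *
 *	int mib[2] = { CTL_MACHDEP, CPU_LIDACTION };
 *	int val;
 *	size_t len = sizeof(val);
 *	if (sysctl(mib, 2, &val, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */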

/*
 * machine dependent system variables.
 */
int
cpu_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen, struct proc *p)
{
	extern uint64_t tsc_frequency;
	dev_t consdev;
	dev_t dev;

	switch (name[0]) {
	case CPU_CONSDEV:
		if (namelen != 1)
			return (ENOTDIR);	/* overloaded */
		if (cn_tab != NULL)
			consdev = cn_tab->cn_dev;
		else
			consdev = NODEV;
		return (sysctl_rdstruct(oldp, oldlenp, newp, &consdev,
		    sizeof consdev));
	case CPU_CHR2BLK:
		if (namelen != 2)
			return (ENOTDIR);	/* overloaded */
		dev = chrtoblk((dev_t)name[1]);
		return sysctl_rdstruct(oldp, oldlenp, newp, &dev, sizeof(dev));
	case CPU_BIOS:
		return bios_sysctl(name + 1, namelen - 1, oldp, oldlenp,
		    newp, newlen, p);
	case CPU_CPUVENDOR:
		return (sysctl_rdstring(oldp, oldlenp, newp, cpu_vendor));
	case CPU_KBDRESET:
		return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
		    &kbd_reset));
	case CPU_ALLOWAPERTURE:
		if (namelen != 1)
			return (ENOTDIR);	/* overloaded */
#ifdef APERTURE
		if (securelevel > 0)
			return (sysctl_int_lower(oldp, oldlenp, newp, newlen,
			    &allowaperture));
		else
			return (sysctl_int(oldp, oldlenp, newp, newlen,
			    &allowaperture));
#else
		return (sysctl_rdint(oldp, oldlenp, newp, 0));
#endif
#if NPCKBC > 0 && NUKBD > 0
	case CPU_FORCEUKBD:
	{
		int error;

		if (forceukbd)
			return (sysctl_rdint(oldp, oldlenp, newp, forceukbd));

		error = sysctl_int(oldp, oldlenp, newp, newlen, &forceukbd);
		if (forceukbd)
			pckbc_release_console();
		return (error);
	}
#endif
	case CPU_TSCFREQ:
		return (sysctl_rdquad(oldp, oldlenp, newp, tsc_frequency));
	default:
		return (sysctl_bounded_arr(cpuctl_vars, nitems(cpuctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen));
	}
	/* NOTREACHED */
}

static inline void
maybe_enable_user_cet(struct proc *p)
{
#ifndef SMALL_KERNEL
	/* Enable indirect-branch tracking if present and not disabled */
	if ((xsave_mask & XFEATURE_CET_U) &&
	    (p->p_p->ps_flags & PS_NOBTCFI) == 0) {
		uint64_t msr = rdmsr(MSR_U_CET);
		wrmsr(MSR_U_CET, msr | MSR_CET_ENDBR_EN | MSR_CET_NO_TRACK_EN);
	}
#endif
}

static inline void
initialize_thread_xstate(struct proc *p)
{
	if (cpu_use_xsaves) {
		xrstors(fpu_cleandata, xsave_mask);
		maybe_enable_user_cet(p);
	} else {
		/* Reset FPU state in PCB */
		memcpy(&p->p_addr->u_pcb.pcb_savefpu, fpu_cleandata,
		    fpu_save_len);

		if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
			/* state in CPU is obsolete; reset it */
			fpureset();
		}
	}

	/* The reset state _is_ the userspace state for this thread now */
	curcpu()->ci_pflags |= CPUPF_USERXSTATE;
}

/*
 * Copy out the FPU state, massaging it to be usable from userspace
 * and acceptable to xrstor_user()
 */
static inline int
copyoutfpu(struct savefpu *sfp, char *sp, size_t len)
{
	uint64_t bvs[2];

	if (copyout(sfp, sp, len))
		return 1;
	if (len > offsetof(struct savefpu, fp_xstate.xstate_bv)) {
		sp += offsetof(struct savefpu, fp_xstate.xstate_bv);
		len -= offsetof(struct savefpu, fp_xstate.xstate_bv);
		bvs[0] = sfp->fp_xstate.xstate_bv & XFEATURE_XCR0_MASK;
		bvs[1] = sfp->fp_xstate.xstate_xcomp_bv &
		    (XFEATURE_XCR0_MASK | XFEATURE_COMPRESSED);
		if (copyout(bvs, sp, min(len, sizeof bvs)))
			return 1;
	}
	return 0;
}

/*
 * Send a signal to a process.
 *
 * Stack is set up to allow sigcode to call routine, followed by
 * syscall to sigreturn routine below.  After sigreturn resets the
 * signal mask, the stack, and the frame pointer, it returns to the
 * user specified pc.
 */
int
sendsig(sig_t catcher, int sig, sigset_t mask, const siginfo_t *ksip,
    int info, int onstack)
{
	struct proc *p = curproc;
	struct trapframe *tf = p->p_md.md_regs;
	struct sigcontext ksc;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	register_t sp, scp, sip;
	u_long sss;

	memset(&ksc, 0, sizeof ksc);
	ksc.sc_rdi = tf->tf_rdi;
	ksc.sc_rsi = tf->tf_rsi;
	ksc.sc_rdx = tf->tf_rdx;
	ksc.sc_rcx = tf->tf_rcx;
	ksc.sc_r8 = tf->tf_r8;
	ksc.sc_r9 = tf->tf_r9;
	ksc.sc_r10 = tf->tf_r10;
	ksc.sc_r11 = tf->tf_r11;
	ksc.sc_r12 = tf->tf_r12;
	ksc.sc_r13 = tf->tf_r13;
	ksc.sc_r14 = tf->tf_r14;
	ksc.sc_r15 = tf->tf_r15;
	ksc.sc_rbx = tf->tf_rbx;
	ksc.sc_rax = tf->tf_rax;
	ksc.sc_rbp = tf->tf_rbp;
	ksc.sc_rip = tf->tf_rip;
	ksc.sc_cs = tf->tf_cs;
	ksc.sc_rflags = tf->tf_rflags;
	ksc.sc_rsp = tf->tf_rsp;
	ksc.sc_ss = tf->tf_ss;
	ksc.sc_mask = mask;

	/* Allocate space for the signal handler context. */
	if ((p->p_sigstk.ss_flags & SS_DISABLE) == 0 &&
	    !sigonstack(tf->tf_rsp) && onstack)
		sp = trunc_page((vaddr_t)p->p_sigstk.ss_sp + p->p_sigstk.ss_size);
	else
		sp = tf->tf_rsp - 128;

	sp -= fpu_save_len;
	if (cpu_use_xsaves)
		sp &= ~63ULL;	/* just in case */
	else
		sp &= ~15ULL;	/* just in case */

	/* Save FPU state to PCB if necessary, then copy it out */
	if (curcpu()->ci_pflags & CPUPF_USERXSTATE)
		fpusave(&p->p_addr->u_pcb.pcb_savefpu);
	if (copyoutfpu(sfp, (void *)sp, fpu_save_len))
		return 1;

	initialize_thread_xstate(p);

	ksc.sc_fpstate = (struct fxsave64 *)sp;
	sss = (sizeof(ksc) + 15) & ~15;
	sip = 0;
	if (info) {
		sip = sp - ((sizeof(*ksip) + 15) & ~15);
		sss += (sizeof(*ksip) + 15) & ~15;

		if (copyout(ksip, (void *)sip, sizeof(*ksip)))
			return 1;
	}
	scp = sp - sss;

	ksc.sc_cookie = (long)scp ^ p->p_p->ps_sigcookie;
	if (copyout(&ksc, (void *)scp, sizeof(ksc)))
		return 1;

	/*
	 * Build context to run handler in.
	 */
	tf->tf_rax = (u_int64_t)catcher;
	tf->tf_rdi = sig;
	tf->tf_rsi = sip;
	tf->tf_rdx = scp;

	tf->tf_rip = (u_int64_t)p->p_p->ps_sigcode;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags &= ~(PSL_T|PSL_D|PSL_VM|PSL_AC);
	tf->tf_rsp = scp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	return 0;
}
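
/*
 * For reference, an assumption-level sketch of what the userland
 * trampoline at ps_sigcode does with the frame built above (the real
 * code lives in locore):
 *
 *	call	*%rax			# catcher(sig, sip, scp)
 *	movq	%rsp,%rdi		# %rsp is back at the sigcontext
 *	movl	$SYS_sigreturn,%eax
 *	syscall				# does not return on success
 */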

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * psl to gain improper privileges or to cause
 * a machine fault.
 */
int
sys_sigreturn(struct proc *p, void *v, register_t *retval)
{
	struct sys_sigreturn_args /* {
		syscallarg(struct sigcontext *) sigcntxp;
	} */ *uap = v;
	struct sigcontext ksc, *scp = SCARG(uap, sigcntxp);
	struct trapframe *tf = p->p_md.md_regs;
	struct savefpu *sfp = &p->p_addr->u_pcb.pcb_savefpu;
	int error;

	if (PROC_PC(p) != p->p_p->ps_sigcoderet) {
		sigexit(p, SIGILL);
		return (EPERM);
	}

	if ((error = copyin((caddr_t)scp, &ksc, sizeof ksc)))
		return (error);

	if (ksc.sc_cookie != ((long)scp ^ p->p_p->ps_sigcookie)) {
		sigexit(p, SIGILL);
		return (EFAULT);
	}

	/* Prevent reuse of the sigcontext cookie */
	ksc.sc_cookie = 0;
	(void)copyout(&ksc.sc_cookie, (caddr_t)scp +
	    offsetof(struct sigcontext, sc_cookie), sizeof (ksc.sc_cookie));

	if (((ksc.sc_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0 ||
	    !USERMODE(ksc.sc_cs, ksc.sc_eflags))
		return (EINVAL);

	/* Current FPU state is obsolete; toss it and force a reload */
	if (curcpu()->ci_pflags & CPUPF_USERXSTATE) {
		curcpu()->ci_pflags &= ~CPUPF_USERXSTATE;
		fpureset();
	}

	/* Copy in the FPU state to restore */
	if (__predict_true(ksc.sc_fpstate != NULL)) {
		if ((error = copyin(ksc.sc_fpstate, sfp, fpu_save_len)))
			return error;
		if (xrstor_user(sfp, xsave_mask)) {
			memcpy(sfp, fpu_cleandata, fpu_save_len);
			return EINVAL;
		}
		maybe_enable_user_cet(p);
		curcpu()->ci_pflags |= CPUPF_USERXSTATE;
	} else {
		/* shouldn't happen, but handle it */
		initialize_thread_xstate(p);
	}

	tf->tf_rdi = ksc.sc_rdi;
	tf->tf_rsi = ksc.sc_rsi;
	tf->tf_rdx = ksc.sc_rdx;
	tf->tf_rcx = ksc.sc_rcx;
	tf->tf_r8 = ksc.sc_r8;
	tf->tf_r9 = ksc.sc_r9;
	tf->tf_r10 = ksc.sc_r10;
	tf->tf_r11 = ksc.sc_r11;
	tf->tf_r12 = ksc.sc_r12;
	tf->tf_r13 = ksc.sc_r13;
	tf->tf_r14 = ksc.sc_r14;
	tf->tf_r15 = ksc.sc_r15;
	tf->tf_rbx = ksc.sc_rbx;
	tf->tf_rax = ksc.sc_rax;
	tf->tf_rbp = ksc.sc_rbp;
	tf->tf_rip = ksc.sc_rip;
	tf->tf_cs = ksc.sc_cs;
	tf->tf_rflags = ksc.sc_rflags;
	tf->tf_rsp = ksc.sc_rsp;
	tf->tf_ss = ksc.sc_ss;

	/* Restore signal mask. */
	p->p_sigmask = ksc.sc_mask & ~sigcantmask;

	/*
	 * sigreturn() needs to return to userspace via the 'iretq'
	 * method, so that if the process was interrupted (by tick,
	 * an IPI, whatever) as opposed to already being in the kernel
	 * when a signal was being delivered, the process will be
	 * completely restored, including the userland %rcx and %r11
	 * registers which the 'sysretq' instruction cannot restore.
	 * Also need to make sure we can handle faulting on xrstor.
	 */
	p->p_md.md_flags |= MDP_IRET;

	return (EJUSTRETURN);
}

#ifdef MULTIPROCESSOR
/* force a CPU into the kernel, whether or not it's idle */
void
cpu_kick(struct cpu_info *ci)
{
	/* only need to kick other CPUs */
	if (ci != curcpu()) {
		if (cpu_mwait_size > 0) {
			/*
			 * If not idling, then send an IPI, else
			 * just clear the "keep idling" bit.
			 */
			if ((ci->ci_mwait & MWAIT_IN_IDLE) == 0)
				x86_send_ipi(ci, X86_IPI_NOP);
			else
				atomic_clearbits_int(&ci->ci_mwait,
				    MWAIT_KEEP_IDLING);
		} else {
			/* no mwait, so need an IPI */
			x86_send_ipi(ci, X86_IPI_NOP);
		}
	}
}
#endif

/*
 * Notify the current process (p) that it has a signal pending,
 * to be processed as soon as possible.
 */
void
signotify(struct proc *p)
{
	aston(p);
	cpu_kick(p->p_cpu);
}

#ifdef MULTIPROCESSOR
void
cpu_unidle(struct cpu_info *ci)
{
	if (cpu_mwait_size > 0 && (ci->ci_mwait & MWAIT_ONLY)) {
		/*
		 * Just clear the "keep idling" bit; if it wasn't
		 * idling then we didn't need to do anything anyway.
		 */
		atomic_clearbits_int(&ci->ci_mwait, MWAIT_KEEP_IDLING);
		return;
	}

	if (ci != curcpu())
		x86_send_ipi(ci, X86_IPI_NOP);
}
#endif

int waittime = -1;
struct pcb dumppcb;

__dead void
boot(int howto)
{
#if NACPI > 0
	if ((howto & RB_POWERDOWN) != 0 && acpi_softc)
		acpi_softc->sc_state = ACPI_STATE_S5;
#endif

	if ((howto & RB_POWERDOWN) != 0)
		lid_action = 0;

	if ((howto & RB_RESET) != 0)
		goto doreset;

	if (cold) {
		if ((howto & RB_USERREQ) == 0)
			howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;
	if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
		waittime = 0;
		vfs_shutdown(curproc);

		if ((howto & RB_TIMEBAD) == 0) {
			resettodr();
		} else {
			printf("WARNING: not updating battery clock\n");
		}
	}
	if_downall();

	uvm_shutdown();
	splhigh();
	cold = 1;

	if ((howto & RB_DUMP) != 0)
		dumpsys();

haltsys:
	config_suspend_all(DVACT_POWERDOWN);

#ifdef MULTIPROCESSOR
	x86_broadcast_ipi(X86_IPI_HALT);
#endif

	if ((howto & RB_HALT) != 0) {
#if NACPI > 0 && !defined(SMALL_KERNEL)
		extern int acpi_enabled;

		if (acpi_enabled) {
			delay(500000);
			if ((howto & RB_POWERDOWN) != 0)
				acpi_powerdown();
		}
#endif
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cnpollc(1);	/* for proper keyboard command handling */
		cngetc();
		cnpollc(0);
	}

doreset:
	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * These variables are needed by /sbin/savecore
 */
u_long dumpmag = 0x8fca0101;	/* magic number */
int dumpsize = 0;		/* pages */
long dumplo = 0;		/* blocks */
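
/*
 * On-disk layout of the dump, as produced by cpu_dump() and dumpsys()
 * below (a sketch derived from the code): one disk block of headers --
 * a kcore_seg_t, then a cpu_kcore_hdr_t, then mem_cluster_cnt
 * phys_ram_seg_t descriptors, each ALIGN()ed -- followed by the raw
 * pages of every memory cluster.  savecore(8) locates the dump through
 * dumpmag, dumplo and dumpsize.
 */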

/*
 * cpu_dump: dump the machine-dependent kernel core dump headers.
 */
int
cpu_dump(void)
{
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	char buf[dbtob(1)];
	kcore_seg_t *segp;
	cpu_kcore_hdr_t *cpuhdrp;
	phys_ram_seg_t *memsegp;
	caddr_t va;
	int i;

	dump = bdevsw[major(dumpdev)].d_dump;

	memset(buf, 0, sizeof buf);
	segp = (kcore_seg_t *)buf;
	cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
	memsegp = (phys_ram_seg_t *)&buf[ALIGN(sizeof(*segp)) +
	    ALIGN(sizeof(*cpuhdrp))];

	/*
	 * Generate a segment header.
	 */
	CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
	segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));

	/*
	 * Add the machine-dependent header info.
	 */
	cpuhdrp->ptdpaddr = proc0.p_addr->u_pcb.pcb_cr3;
	cpuhdrp->nmemsegs = mem_cluster_cnt;

	/*
	 * Fill in the memory segment descriptors.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		memsegp[i].start = mem_clusters[i].start;
		memsegp[i].size = mem_clusters[i].size & ~PAGE_MASK;
	}

	/*
	 * If we have a dump bounce buffer, assume the kernel stack may be
	 * in high memory and bounce the headers through it.
	 */
	if (dumpmem_vaddr != 0) {
		memcpy((char *)dumpmem_vaddr, buf, sizeof(buf));
		va = (caddr_t)dumpmem_vaddr;
	} else {
		va = (caddr_t)buf;
	}
	return (dump(dumpdev, dumplo, va, dbtob(1)));
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 */
void
dumpconf(void)
{
	int nblks, dumpblks;	/* size of dump area */

	if (dumpdev == NODEV ||
	    (nblks = (bdevsw[major(dumpdev)].d_psize)(dumpdev)) == 0)
		return;
	if (nblks <= ctod(1))
		return;

	dumpblks = cpu_dumpsize();
	if (dumpblks < 0)
		return;
	dumpblks += ctod(cpu_dump_mempagecnt());

	/* If dump won't fit (incl. room for possible label), punt. */
	if (dumpblks > (nblks - ctod(1)))
		return;

	/* Put dump at end of partition */
	dumplo = nblks - dumpblks;

	/* dumpsize is in page units, and doesn't include headers. */
	dumpsize = cpu_dump_mempagecnt();
}
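
/*
 * Worked example for dumpconf() (illustrative numbers only): with
 * 512-byte disk blocks and 4 KB pages, 4 GB of RAM is ~1M pages, so
 * dumpblks is ~8M blocks plus one header block; on a 16M-block (8 GB)
 * dump partition, dumplo lands about halfway in, keeping the front of
 * the partition (and any disklabel) untouched.
 */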

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
 */
#define BYTES_PER_DUMP	MAXPHYS	/* must be a multiple of pagesize */

void
dumpsys(void)
{
	u_long totalbytesleft, bytes, i, n, memseg;
	u_long maddr;
	daddr_t blkno;
	void *va;
	int (*dump)(dev_t, daddr_t, caddr_t, size_t);
	int error;

	/* Save registers. */
	savectx(&dumppcb);

	if (dumpdev == NODEV)
		return;

	/*
	 * For dumps during autoconfiguration,
	 * if dump device has already configured...
	 */
	if (dumpsize == 0)
		dumpconf();
	if (dumplo <= 0 || dumpsize == 0) {
		printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
		    minor(dumpdev));
		return;
	}
	printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
	    minor(dumpdev), dumplo);

	error = (*bdevsw[major(dumpdev)].d_psize)(dumpdev);
	printf("dump ");
	if (error == -1) {
		printf("area unavailable\n");
		return;
	}

	if ((error = cpu_dump()) != 0)
		goto err;

	totalbytesleft = ptoa(cpu_dump_mempagecnt());
	blkno = dumplo + cpu_dumpsize();
	dump = bdevsw[major(dumpdev)].d_dump;
	error = 0;

	for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
		maddr = mem_clusters[memseg].start;
		bytes = mem_clusters[memseg].size;

		for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
			/* Print out how many MBs we have left to go. */
			if ((totalbytesleft % (1024*1024)) < BYTES_PER_DUMP)
				printf("%ld ", totalbytesleft / (1024 * 1024));

			/* Limit size for next transfer. */
			n = bytes - i;
			if (n > BYTES_PER_DUMP)
				n = BYTES_PER_DUMP;
			if (maddr > 0xffffffff) {
				va = (void *)dumpmem_vaddr;
				if (n > dumpmem_sz)
					n = dumpmem_sz;
				memcpy(va, (void *)PMAP_DIRECT_MAP(maddr), n);
			} else {
				va = (void *)PMAP_DIRECT_MAP(maddr);
			}

			error = (*dump)(dumpdev, blkno, va, n);
			if (error)
				goto err;
			maddr += n;
			blkno += btodb(n);	/* XXX? */

#if 0	/* XXX this doesn't work.  grr. */
			/* operator aborting dump? */
			if (sget() != NULL) {
				error = EINTR;
				break;
			}
#endif
		}
	}

err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * Force the userspace FS.base to be reloaded from the PCB on return from
 * the kernel, and reset the segment registers (%ds, %es, %fs, and %gs)
 * to their expected userspace value.
 */
void
reset_segs(void)
{
	/*
	 * This operates like the cpu_switchto() sequence: if we
	 * haven't reset %[defg]s already, do so now.
	 */
	if (curcpu()->ci_pflags & CPUPF_USERSEGS) {
		curcpu()->ci_pflags &= ~CPUPF_USERSEGS;
		__asm volatile(
		    "movw %%ax,%%ds\n\t"
		    "movw %%ax,%%es\n\t"
		    "movw %%ax,%%fs\n\t"
		    "cli\n\t"		/* block intr when on user GS.base */
		    "swapgs\n\t"	/* swap from kernel to user GS.base */
		    "movw %%ax,%%gs\n\t"/* set %gs to UDATA and GS.base to 0 */
		    "swapgs\n\t"	/* back to kernel GS.base */
		    "sti" : : "a"(GSEL(GUDATA_SEL, SEL_UPL)));
	}
}

/*
 * Clear registers on exec
 */
void
setregs(struct proc *p, struct exec_package *pack, u_long stack,
    struct ps_strings *arginfo)
{
	struct trapframe *tf;

	initialize_thread_xstate(p);

	/* To reset all registers we have to return via iretq */
	p->p_md.md_flags |= MDP_IRET;

	reset_segs();
	p->p_addr->u_pcb.pcb_fsbase = 0;

	tf = p->p_md.md_regs;
	memset(tf, 0, sizeof *tf);
	tf->tf_rip = pack->ep_entry;
	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
	tf->tf_rflags = PSL_USERSET;
	tf->tf_rsp = stack;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

struct gate_descriptor *idt;
char idt_allocmap[NIDT];
struct user *proc0paddr = NULL;

void
setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl,
    int sel)
{
	gd->gd_looffset = (u_int64_t)func & 0xffff;
	gd->gd_selector = sel;
	gd->gd_ist = ist;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (u_int64_t)func >> 16;
	gd->gd_zero = 0;
	gd->gd_xx1 = 0;
	gd->gd_xx2 = 0;
	gd->gd_xx3 = 0;
}

void
unsetgate(struct gate_descriptor *gd)
{
	memset(gd, 0, sizeof (*gd));
}
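
/*
 * Sketch of the 16-byte long-mode gate descriptor that setgate() fills
 * in: the 64-bit handler address is split into a low 16-bit part
 * (gd_looffset) and the remaining high 48 bits (gd_hioffset), with the
 * code segment selector, IST index, gate type, DPL and present bit
 * packed between them.
 */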

void
setregion(struct region_descriptor *rd, void *base, u_int16_t limit)
{
	rd->rd_limit = limit;
	rd->rd_base = (u_int64_t)base;
}

/*
 * Note that the base and limit fields are ignored in long mode.
 */
void
set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran, int def32, int is64)
{
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (unsigned long)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_avl = 0;
	sd->sd_long = is64;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (unsigned long)base >> 24;
}

void
set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
    int type, int dpl, int gran)
{
	memset(sd, 0, sizeof *sd);
	sd->sd_lolimit = (unsigned)limit;
	sd->sd_lobase = (u_int64_t)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (unsigned)limit >> 16;
	sd->sd_gran = gran;
	sd->sd_hibase = (u_int64_t)base >> 24;
}

void
cpu_init_idt(void)
{
	struct region_descriptor region;

	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
}

void
cpu_init_extents(void)
{
	extern struct extent *iomem_ex;
	static int already_done;
	int i;

	/* We get called for each CPU, only first should do this */
	if (already_done)
		return;

	/*
	 * Allocate the physical addresses used by RAM from the iomem
	 * extent map.
	 */
	for (i = 0; i < mem_cluster_cnt; i++) {
		if (extent_alloc_region(iomem_ex, mem_clusters[i].start,
		    mem_clusters[i].size, EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE RAM (%llx-%llx)"
			    " FROM IOMEM EXTENT MAP!\n", mem_clusters[i].start,
			    mem_clusters[i].start + mem_clusters[i].size - 1);
		}
	}

	already_done = 1;
}

void
map_tramps(void)
{
#if defined(MULTIPROCESSOR) || \
    (NACPI > 0 && !defined(SMALL_KERNEL))
	struct pmap *kmp = pmap_kernel();
	extern paddr_t tramp_pdirpa;
#ifdef MULTIPROCESSOR
	extern u_char cpu_spinup_trampoline[];
	extern u_char cpu_spinup_trampoline_end[];
	extern u_char mp_tramp_data_start[];
	extern u_char mp_tramp_data_end[];
	extern u_int32_t mp_pdirpa;
#endif

	/*
	 * The initial PML4 pointer must be below 4G, so if the
	 * current one isn't, use a "bounce buffer" and save it
	 * for tramps to use.
	 */
1359 */ 1360 if (kmp->pm_pdirpa > 0xffffffff) { 1361 pmap_kenter_pa(lo32_vaddr, lo32_paddr, PROT_READ | PROT_WRITE); 1362 memcpy((void *)lo32_vaddr, kmp->pm_pdir, PAGE_SIZE); 1363 tramp_pdirpa = lo32_paddr; 1364 pmap_kremove(lo32_vaddr, PAGE_SIZE); 1365 } else 1366 tramp_pdirpa = kmp->pm_pdirpa; 1367 1368 1369 #ifdef MULTIPROCESSOR 1370 /* Map MP tramp code and data pages RW for copy */ 1371 pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, 1372 PROT_READ | PROT_WRITE); 1373 1374 pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, 1375 PROT_READ | PROT_WRITE); 1376 1377 memset((caddr_t)MP_TRAMPOLINE, 0xcc, PAGE_SIZE); 1378 memset((caddr_t)MP_TRAMP_DATA, 0xcc, PAGE_SIZE); 1379 1380 memcpy((caddr_t)MP_TRAMPOLINE, 1381 cpu_spinup_trampoline, 1382 cpu_spinup_trampoline_end-cpu_spinup_trampoline); 1383 1384 memcpy((caddr_t)MP_TRAMP_DATA, 1385 mp_tramp_data_start, 1386 mp_tramp_data_end - mp_tramp_data_start); 1387 1388 /* 1389 * We need to patch this after we copy the tramp data, 1390 * the symbol points into the copied tramp data page. 1391 */ 1392 mp_pdirpa = tramp_pdirpa; 1393 1394 /* Unmap, will be remapped in cpu_start_secondary */ 1395 pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE); 1396 pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE); 1397 #endif /* MULTIPROCESSOR */ 1398 #endif 1399 } 1400 1401 void 1402 cpu_set_vendor(struct cpu_info *ci, int level, const char *vendor) 1403 { 1404 ci->ci_cpuid_level = level; 1405 cpuid_level = MIN(cpuid_level, level); 1406 1407 /* map the vendor string to an integer */ 1408 if (strcmp(vendor, "AuthenticAMD") == 0) 1409 ci->ci_vendor = CPUV_AMD; 1410 else if (strcmp(vendor, "GenuineIntel") == 0) 1411 ci->ci_vendor = CPUV_INTEL; 1412 else if (strcmp(vendor, "CentaurHauls") == 0) 1413 ci->ci_vendor = CPUV_VIA; 1414 else 1415 ci->ci_vendor = CPUV_UNKNOWN; 1416 } 1417 1418 #define IDTVEC(name) __CONCAT(X, name) 1419 typedef void (vector)(void); 1420 extern vector *IDTVEC(exceptions)[]; 1421 1422 paddr_t early_pte_pages; 1423 1424 void 1425 init_x86_64(paddr_t first_avail) 1426 { 1427 struct region_descriptor region; 1428 bios_memmap_t *bmp; 1429 int x, ist; 1430 uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30; 1431 1432 /* 1433 * locore0 mapped 3 pages for use before the pmap is initialized 1434 * starting at first_avail. These pages are currently used by 1435 * efifb to create early-use VAs for the framebuffer before efifb 1436 * is attached. 1437 */ 1438 early_pte_pages = first_avail; 1439 first_avail += 3 * NBPG; 1440 1441 cpu_set_vendor(&cpu_info_primary, cpuid_level, cpu_vendor); 1442 cpu_init_msrs(&cpu_info_primary); 1443 1444 proc0.p_addr = proc0paddr; 1445 cpu_info_primary.ci_curpcb = &proc0.p_addr->u_pcb; 1446 1447 x86_bus_space_init(); 1448 1449 i8254_startclock(); 1450 1451 /* 1452 * Initialize PAGE_SIZE-dependent variables. 1453 */ 1454 uvm_setpagesize(); 1455 1456 /* 1457 * Boot arguments are in a single page specified by /boot. 1458 * 1459 * We require the "new" vector form, as well as memory ranges 1460 * to be given in bytes rather than KB. 1461 * 1462 * locore copies the data into bootinfo[] for us. 1463 */ 1464 if ((bootapiver & (BAPIV_VECTOR | BAPIV_BMEMMAP)) == 1465 (BAPIV_VECTOR | BAPIV_BMEMMAP)) { 1466 if (bootinfo_size >= sizeof(bootinfo)) 1467 panic("boot args too big"); 1468 1469 getbootinfo(bootinfo, bootinfo_size); 1470 } else 1471 panic("invalid /boot"); 1472 1473 cninit(); 1474 1475 /* 1476 * Memory on the AMD64 port is described by three different things. 1477 * 1478 * 1. 
	 * 1. biosbasemem - This is outdated, and should really only be used
	 *    to sanitize the other values.  This is what we get back from
	 *    the BIOS using the legacy routines, describing memory below
	 *    640KB.
	 *
	 * 2. bios_memmap[] - This is the memory map as the BIOS has
	 *    returned it to us.  It includes memory the kernel occupies,
	 *    etc.
	 *
	 * 3. mem_cluster[] - This is the massaged free memory segments
	 *    after taking into account the contents of bios_memmap,
	 *    biosbasemem, and locore/machdep/pmap kernel allocations of
	 *    physical pages.
	 *
	 * The other thing is that the physical page *RANGE* is described by
	 * three more variables:
	 *
	 * avail_start - This is a physical address of the start of available
	 *    pages, until IOM_BEGIN.  This is basically the start of the
	 *    UVM managed range of memory, with some holes...
	 *
	 * avail_end - This is the end of physical pages.  All physical
	 *    pages that UVM manages are between avail_start and avail_end.
	 *    There are holes...
	 *
	 * first_avail - This is the first available physical page after the
	 *    kernel, page tables, etc.
	 *
	 * We skip the first few pages for trampolines, hibernate, and to
	 * avoid buggy SMI implementations that could corrupt the first
	 * 64KB.
	 */
	avail_start = 16*PAGE_SIZE;

#ifdef MULTIPROCESSOR
	if (avail_start < MP_TRAMPOLINE + PAGE_SIZE)
		avail_start = MP_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < MP_TRAMP_DATA + PAGE_SIZE)
		avail_start = MP_TRAMP_DATA + PAGE_SIZE;
#endif

#if (NACPI > 0 && !defined(SMALL_KERNEL))
	if (avail_start < ACPI_TRAMPOLINE + PAGE_SIZE)
		avail_start = ACPI_TRAMPOLINE + PAGE_SIZE;
	if (avail_start < ACPI_TRAMP_DATA + PAGE_SIZE)
		avail_start = ACPI_TRAMP_DATA + PAGE_SIZE;
#endif

#ifdef HIBERNATE
	if (avail_start < HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE)
		avail_start = HIBERNATE_HIBALLOC_PAGE + PAGE_SIZE;
#endif /* HIBERNATE */

	/*
	 * We need to go through the BIOS memory map given, and
	 * fill out mem_clusters and mem_cluster_cnt stuff, taking
	 * into account all the points listed above.
	 */
	avail_end = mem_cluster_cnt = 0;
	for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
		paddr_t s1, s2, e1, e2;

		/* Ignore non-free memory */
		if (bmp->type != BIOS_MAP_FREE)
			continue;
		if (bmp->size < PAGE_SIZE)
			continue;

		/* Init our segment(s), round/trunc to pages */
		s1 = round_page(bmp->addr);
		e1 = trunc_page(bmp->addr + bmp->size);
		s2 = e2 = 0;

		/*
		 * XXX Some buggy ACPI BIOSes use memory that they
		 * declare as free.  Current worst offender is
		 * Supermicro 5019D-FTN4.  Typically the affected memory
		 * areas are small blocks between areas reserved for
		 * ACPI and other BIOS goo.  So skip areas smaller
		 * than 32 MB above the 16 MB boundary (to avoid
		 * affecting legacy stuff).
		 */
		if (s1 > 16*1024*1024 && (e1 - s1) < 32*1024*1024)
			continue;

		/* Check and adjust our segment(s) */
		/* Nuke low pages */
		if (s1 < avail_start) {
			s1 = avail_start;
			if (s1 > e1)
				continue;
		}

		/*
		 * The direct map is limited to 512GB * NUM_L4_SLOT_DIRECT
		 * of memory, so discard anything above that.
		 */
1571 */ 1572 if (e1 >= max_dm_size) { 1573 e1 = max_dm_size; 1574 if (s1 > e1) 1575 continue; 1576 } 1577 1578 /* Crop stuff into "640K hole" */ 1579 if (s1 < IOM_BEGIN && e1 > IOM_BEGIN) 1580 e1 = IOM_BEGIN; 1581 if (s1 < biosbasemem && e1 > biosbasemem) 1582 e1 = biosbasemem; 1583 1584 /* Split any segments straddling the 16MB boundary */ 1585 if (s1 < 16*1024*1024 && e1 > 16*1024*1024) { 1586 e2 = e1; 1587 s2 = e1 = 16*1024*1024; 1588 } 1589 1590 /* Store segment(s) */ 1591 if (e1 - s1 >= PAGE_SIZE) { 1592 mem_clusters[mem_cluster_cnt].start = s1; 1593 mem_clusters[mem_cluster_cnt].size = e1 - s1; 1594 mem_cluster_cnt++; 1595 } 1596 if (e2 - s2 >= PAGE_SIZE) { 1597 mem_clusters[mem_cluster_cnt].start = s2; 1598 mem_clusters[mem_cluster_cnt].size = e2 - s2; 1599 mem_cluster_cnt++; 1600 } 1601 if (avail_end < e1) avail_end = e1; 1602 if (avail_end < e2) avail_end = e2; 1603 } 1604 1605 /* 1606 * Call pmap initialization to make new kernel address space. 1607 * We must do this before loading pages into the VM system. 1608 */ 1609 first_avail = pmap_bootstrap(first_avail, trunc_page(avail_end)); 1610 1611 #if NEFI > 0 1612 /* Relocate the EFI memory map. */ 1613 if (bios_efiinfo && bios_efiinfo->mmap_start) { 1614 mmap = (EFI_MEMORY_DESCRIPTOR *)PMAP_DIRECT_MAP(first_avail); 1615 memcpy(mmap, (void *)PMAP_DIRECT_MAP(bios_efiinfo->mmap_start), 1616 bios_efiinfo->mmap_size); 1617 first_avail += round_page(bios_efiinfo->mmap_size); 1618 } 1619 #endif 1620 1621 /* Allocate these out of the 640KB base memory */ 1622 if (avail_start != PAGE_SIZE) 1623 avail_start = pmap_prealloc_lowmem_ptps(avail_start); 1624 1625 cpu_init_extents(); 1626 1627 /* Make sure the end of the space used by the kernel is rounded. */ 1628 first_avail = round_page(first_avail); 1629 kern_end = KERNBASE + first_avail; 1630 1631 /* 1632 * Now, load the memory clusters (which have already been 1633 * flensed) into the VM system. 1634 */ 1635 for (x = 0; x < mem_cluster_cnt; x++) { 1636 paddr_t seg_start = mem_clusters[x].start; 1637 paddr_t seg_end = seg_start + mem_clusters[x].size; 1638 1639 if (seg_start < first_avail) seg_start = first_avail; 1640 if (seg_start > seg_end) continue; 1641 if (seg_end - seg_start < PAGE_SIZE) continue; 1642 1643 physmem += atop(mem_clusters[x].size); 1644 1645 #if DEBUG_MEMLOAD 1646 printf("loading 0x%lx-0x%lx (0x%lx-0x%lx)\n", 1647 seg_start, seg_end, atop(seg_start), atop(seg_end)); 1648 #endif 1649 uvm_page_physload(atop(seg_start), atop(seg_end), 1650 atop(seg_start), atop(seg_end), 0); 1651 } 1652 1653 /* 1654 * Now, load the memory between the end of I/O memory "hole" 1655 * and the kernel. 1656 */ 1657 { 1658 paddr_t seg_start = round_page(IOM_END); 1659 paddr_t seg_end = trunc_page(KERNTEXTOFF - KERNBASE); 1660 1661 if (seg_start < seg_end) { 1662 #if DEBUG_MEMLOAD 1663 printf("loading 0x%lx-0x%lx\n", seg_start, seg_end); 1664 #endif 1665 uvm_page_physload(atop(seg_start), atop(seg_end), 1666 atop(seg_start), atop(seg_end), 0); 1667 } 1668 } 1669 1670 #if DEBUG_MEMLOAD 1671 printf("avail_start = 0x%lx\n", avail_start); 1672 printf("avail_end = 0x%lx\n", avail_end); 1673 printf("first_avail = 0x%lx\n", first_avail); 1674 #endif 1675 1676 /* 1677 * Steal memory for the message buffer (at end of core). 
	{
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MSGBUFSIZE);
		psize_t reqsz = sz;

		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			if (ptoa(vps->avail_end) == avail_end)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: can't find end of memory");

		/* Shrink so it'll fit in the last segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		msgbuf_paddr = ptoa(vps->avail_end);

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}

		/* Now find where the new avail_end is. */
		for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
			if (vm_physmem[x].avail_end > avail_end)
				avail_end = vm_physmem[x].avail_end;
		avail_end = ptoa(avail_end);

		/* Warn if the message buffer had to be shrunk. */
		if (sz != reqsz)
			printf("WARNING: %ld bytes not available for msgbuf "
			    "in last cluster (%ld used)\n", reqsz, sz);
	}

	/*
	 * Steal some memory for a dump bouncebuffer if we have memory over
	 * the 32-bit barrier.
	 */
	if (avail_end > 0xffffffff) {
		struct vm_physseg *vps = NULL;
		psize_t sz = round_page(MAX(BYTES_PER_DUMP, dbtob(1)));

		/* XXX assumes segments are ordered */
		for (x = 0; x < vm_nphysseg; x++) {
			vps = &vm_physmem[x];
			/* Find something between 16meg and 4gig */
			if (ptoa(vps->avail_end) <= 0xffffffff &&
			    ptoa(vps->avail_start) >= 0xffffff)
				break;
		}
		if (x == vm_nphysseg)
			panic("init_x86_64: no memory between "
			    "0xffffff-0xffffffff");

		/* Shrink so it'll fit in the segment. */
		if ((vps->avail_end - vps->avail_start) < atop(sz))
			sz = ptoa(vps->avail_end - vps->avail_start);

		vps->avail_end -= atop(sz);
		vps->end -= atop(sz);
		dumpmem_paddr = ptoa(vps->avail_end);
		dumpmem_vaddr = PMAP_DIRECT_MAP(dumpmem_paddr);
		dumpmem_sz = sz;

		/* Remove the last segment if it now has no pages. */
		if (vps->start == vps->end) {
			for (vm_nphysseg--; x < vm_nphysseg; x++)
				vm_physmem[x] = vm_physmem[x + 1];
		}
	}

	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);

	pmap_kenter_pa(idt_vaddr, idt_paddr, PROT_READ | PROT_WRITE);

	idt = (struct gate_descriptor *)idt_vaddr;
	cpu_info_primary.ci_tss = &cpu_info_full_primary.cif_tss;
	cpu_info_primary.ci_gdt = &cpu_info_full_primary.cif_gdt;

	/* make gdt gates and memory segments */
	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GCODE_SEL), 0,
	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GDATA_SEL), 0,
	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUDATA_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);

	set_mem_segment(GDT_ADDR_MEM(cpu_info_primary.ci_gdt, GUCODE_SEL), 0,
	    atop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);

	set_sys_segment(GDT_ADDR_SYS(cpu_info_primary.ci_gdt, GPROC0_SEL),
	    cpu_info_primary.ci_tss, sizeof (struct x86_64_tss)-1,
	    SDT_SYS386TSS, SEL_KPL, 0);

	/* exceptions */
	for (x = 0; x < 32; x++) {
		/* trap2 == NMI, trap8 == double fault */
		ist = (x == 2) ? 2 : (x == 8) ? 1 : 0;
		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
		    (x == 3) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
		idt_allocmap[x] = 1;
	}

	setregion(&region, cpu_info_primary.ci_gdt, GDT_SIZE - 1);
	lgdt(&region);

	cpu_init_idt();

	intr_default_setup();

	fpuinit(&cpu_info_primary);

	softintr_init();
	splraise(IPL_IPI);
	intr_enable();

#ifdef DDB
	db_machine_init();
	ddb_init();
	if (boothowto & RB_KDB)
		db_enter();
#endif
}

void
cpu_reset(void)
{
	intr_disable();

	if (cpuresetfn)
		(*cpuresetfn)();

	/*
	 * The keyboard controller has 4 random output pins, one of which is
	 * connected to the RESET pin on the CPU in many PCs.  We tell the
	 * keyboard controller to pulse this line a couple of times.
	 */
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	memset((caddr_t)idt, 0, NIDT * sizeof(idt[0]));
	__asm volatile("divl %0,%1" : : "q" (0), "a" (0));

	for (;;)
		continue;
	/* NOTREACHED */
}

/*
 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
 */
int
cpu_dumpsize(void)
{
	int size;

	size = ALIGN(sizeof(kcore_seg_t)) +
	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
	if (roundup(size, dbtob(1)) != dbtob(1))
		return (-1);

	return (1);
}

/*
 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
 */
u_long
cpu_dump_mempagecnt(void)
{
	u_long i, n;

	n = 0;
	for (i = 0; i < mem_cluster_cnt; i++)
		n += atop(mem_clusters[i].size);
	return (n);
}

/*
 * Figure out which portions of memory are used by the kernel/system.
 */
int
amd64_pa_used(paddr_t addr)
{
	struct vm_page *pg;

	/* Kernel manages these */
	if ((pg = PHYS_TO_VM_PAGE(addr)) && (pg->pg_flags & PG_DEV) == 0)
		return 1;

	/* Kernel is loaded here */
	if (addr > IOM_END && addr < (kern_end - KERNBASE))
		return 1;

	/* Low memory used for various bootstrap things */
	if (addr < avail_start)
		return 1;

	/*
	 * The only regions I can think of that are left are the things
	 * we steal away from UVM.  The message buffer?
	 * XXX - ignore these for now.
	 */

	return 0;
}

void
cpu_initclocks(void)
{
	(*initclock_func)();
}

void
cpu_startclock(void)
{
	(*startclock_func)();
}

void
need_resched(struct cpu_info *ci)
{
	ci->ci_want_resched = 1;

	/* There's a risk we'll be called before the idle threads start */
	if (ci->ci_curproc) {
		aston(ci->ci_curproc);
		cpu_kick(ci);
	}
}

/*
 * Allocate an IDT vector slot within the given range.
 * XXX needs locking to avoid MP allocation races.
 */

int
idt_vec_alloc(int low, int high)
{
	int vec;

	for (vec = low; vec <= high; vec++) {
		if (idt_allocmap[vec] == 0) {
			idt_allocmap[vec] = 1;
			return vec;
		}
	}
	return 0;
}

int
idt_vec_alloc_range(int low, int high, int num)
{
	int i, vec;

	KASSERT(powerof2(num));
	low = (low + num - 1) & ~(num - 1);
	high = ((high + 1) & ~(num - 1)) - 1;

	for (vec = low; vec <= high; vec += num) {
		for (i = 0; i < num; i++) {
			if (idt_allocmap[vec + i] != 0)
				break;
		}
		if (i == num) {
			for (i = 0; i < num; i++)
				idt_allocmap[vec + i] = 1;
			return vec;
		}
	}
	return 0;
}
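
/*
 * Example of the alignment logic above (illustrative values): a caller
 * asking for num = 4 vectors within [0x65, 0x8f] first has 'low'
 * rounded up to 0x68 and 'high' trimmed to 0x8f, then the scan
 * advances in steps of 4, so any block returned is 4-aligned as
 * multi-vector interrupt setups require.
 */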
1971 */ 1972 KASSERT(idt_allocmap[vec] == 1); 1973 setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL, 1974 GSEL(GCODE_SEL, SEL_KPL)); 1975 } 1976 1977 void 1978 idt_vec_free(int vec) 1979 { 1980 unsetgate(&idt[vec]); 1981 idt_allocmap[vec] = 0; 1982 } 1983 1984 #ifdef DIAGNOSTIC 1985 void 1986 splassert_check(int wantipl, const char *func) 1987 { 1988 int cpl = curcpu()->ci_ilevel; 1989 int floor = curcpu()->ci_handled_intr_level; 1990 1991 if (cpl < wantipl) { 1992 splassert_fail(wantipl, cpl, func); 1993 } 1994 if (floor > wantipl) { 1995 splassert_fail(wantipl, floor, func); 1996 } 1997 1998 } 1999 #endif 2000 2001 int 2002 copyin32(const uint32_t *uaddr, uint32_t *kaddr) 2003 { 2004 if ((vaddr_t)uaddr & 0x3) 2005 return EFAULT; 2006 2007 /* copyin(9) is atomic */ 2008 return copyin(uaddr, kaddr, sizeof(uint32_t)); 2009 } 2010 2011 void 2012 getbootinfo(char *bootinfo, int bootinfo_size) 2013 { 2014 bootarg32_t *q; 2015 bios_ddb_t *bios_ddb; 2016 bios_bootduid_t *bios_bootduid; 2017 bios_bootsr_t *bios_bootsr; 2018 #undef BOOTINFO_DEBUG 2019 #ifdef BOOTINFO_DEBUG 2020 printf("bootargv:"); 2021 #endif 2022 2023 for (q = (bootarg32_t *)bootinfo; 2024 (q->ba_type != BOOTARG_END) && 2025 ((((char *)q) - bootinfo) < bootinfo_size); 2026 q = (bootarg32_t *)(((char *)q) + q->ba_size)) { 2027 2028 switch (q->ba_type) { 2029 case BOOTARG_MEMMAP: 2030 bios_memmap = (bios_memmap_t *)q->ba_arg; 2031 #ifdef BOOTINFO_DEBUG 2032 printf(" memmap %p", bios_memmap); 2033 #endif 2034 break; 2035 case BOOTARG_DISKINFO: 2036 bios_diskinfo = (bios_diskinfo_t *)q->ba_arg; 2037 #ifdef BOOTINFO_DEBUG 2038 printf(" diskinfo %p", bios_diskinfo); 2039 #endif 2040 break; 2041 case BOOTARG_APMINFO: 2042 /* generated by i386 boot loader */ 2043 break; 2044 case BOOTARG_CKSUMLEN: 2045 bios_cksumlen = *(u_int32_t *)q->ba_arg; 2046 #ifdef BOOTINFO_DEBUG 2047 printf(" cksumlen %d", bios_cksumlen); 2048 #endif 2049 break; 2050 case BOOTARG_PCIINFO: 2051 /* generated by i386 boot loader */ 2052 break; 2053 case BOOTARG_CONSDEV: { 2054 #if NCOM > 0 2055 bios_consdev_t *cdp = (bios_consdev_t*)q->ba_arg; 2056 static const int ports[] = 2057 { 0x3f8, 0x2f8, 0x3e8, 0x2e8 }; 2058 int unit = minor(cdp->consdev); 2059 uint64_t consaddr = cdp->consaddr; 2060 if (consaddr == -1 && unit >= 0 && unit < nitems(ports)) 2061 consaddr = ports[unit]; 2062 if (major(cdp->consdev) == 8 && consaddr != -1) { 2063 comconsunit = unit; 2064 comconsaddr = consaddr; 2065 comconsrate = cdp->conspeed; 2066 comconsfreq = cdp->consfreq; 2067 comcons_reg_width = cdp->reg_width; 2068 comcons_reg_shift = cdp->reg_shift; 2069 if (cdp->flags & BCD_MMIO) 2070 comconsiot = X86_BUS_SPACE_MEM; 2071 else 2072 comconsiot = X86_BUS_SPACE_IO; 2073 } 2074 #endif 2075 #ifdef BOOTINFO_DEBUG 2076 printf(" console 0x%x:%d", cdp->consdev, cdp->conspeed); 2077 #endif 2078 break; 2079 } 2080 case BOOTARG_BOOTMAC: 2081 bios_bootmac = (bios_bootmac_t *)q->ba_arg; 2082 break; 2083 2084 case BOOTARG_DDB: 2085 bios_ddb = (bios_ddb_t *)q->ba_arg; 2086 #ifdef DDB 2087 db_console = bios_ddb->db_console; 2088 #endif 2089 break; 2090 2091 case BOOTARG_BOOTDUID: 2092 bios_bootduid = (bios_bootduid_t *)q->ba_arg; 2093 memcpy(bootduid, bios_bootduid, sizeof(bootduid)); 2094 break; 2095 2096 case BOOTARG_BOOTSR: 2097 bios_bootsr = (bios_bootsr_t *)q->ba_arg; 2098 #if NSOFTRAID > 0 2099 memcpy(&sr_bootuuid, &bios_bootsr->uuid, 2100 sizeof(sr_bootuuid)); 2101 memcpy(&sr_bootkey, &bios_bootsr->maskkey, 2102 sizeof(sr_bootkey)); 2103 #endif 2104 explicit_bzero(bios_bootsr, 
			break;

		case BOOTARG_EFIINFO:
			bios_efiinfo = (bios_efiinfo_t *)q->ba_arg;
			break;

		case BOOTARG_UCODE:
			bios_ucode = (bios_ucode_t *)q->ba_arg;
			break;

		default:
#ifdef BOOTINFO_DEBUG
			printf(" unsupported arg (%d) %p", q->ba_type,
			    q->ba_arg);
#endif
			break;
		}
	}
#ifdef BOOTINFO_DEBUG
	printf("\n");
#endif
}

int
check_context(const struct reg *regs, struct trapframe *tf)
{
	uint16_t sel;

	if (((regs->r_rflags ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

	sel = regs->r_ss & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

	sel = regs->r_cs & 0xffff;
	if (!VALID_USER_CSEL(sel))
		return EINVAL;

	if (regs->r_rip >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}

int amd64_delay_quality;

void
delay_init(void (*fn)(int), int fn_quality)
{
	if (fn_quality > amd64_delay_quality) {
		delay_func = fn;
		amd64_delay_quality = fn_quality;
	}
}

void
delay_fini(void (*fn)(int))
{
	if (fn == delay_func) {
		delay_func = i8254_delay;
		amd64_delay_quality = 0;
	}
}