/*	$OpenBSD: cpu.c,v 1.195 2024/11/07 17:24:42 bluhm Exp $	*/
/*	$NetBSD: cpu.c,v 1.1 2003/04/26 18:39:26 fvdl Exp $	*/

/*-
 * Copyright (c) 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by RedBack Networks Inc.
 *
 * Author: Bill Sommerfeld
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1999 Stefan Grefen
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the NetBSD
 *      Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "lapic.h"
#include "ioapic.h"
#include "vmm.h"
#include "pctr.h"
#include "pvbus.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/timeout.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/atomic.h>
#include <sys/user.h>

#include <uvm/uvm_extern.h>

#include <machine/codepatch.h>
#include <machine/cpu_full.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#include <machine/mpbiosvar.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/gdt.h>
#include <machine/pio.h>
#include <machine/vmmvar.h>

#if NLAPIC > 0
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#endif

#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif

#if NPCTR > 0
#include <machine/pctr.h>
#endif

#if NPVBUS > 0
#include <dev/pv/pvvar.h>
#endif

#include <dev/ic/mc146818reg.h>
#include <amd64/isa/nvram.h>
#include <dev/isa/isareg.h>

#ifdef HIBERNATE
#include <sys/hibernate.h>
#include <machine/hibernate.h>
#endif /* HIBERNATE */

/* #define CPU_DEBUG */

#ifdef CPU_DEBUG
#define DPRINTF(x...)	do { printf(x); } while (0)
#else
#define DPRINTF(x...)
#endif /* CPU_DEBUG */

int	cpu_match(struct device *, void *, void *);
void	cpu_attach(struct device *, struct device *, void *);
int	cpu_activate(struct device *, int);
void	patinit(struct cpu_info *ci);
#if NVMM > 0
void	cpu_init_vmm(struct cpu_info *ci);
#endif /* NVMM > 0 */

struct cpu_softc {
	struct device sc_dev;		/* device tree glue */
	struct cpu_info *sc_info;	/* pointer to CPU info */
};

void	replacesmap(void);
void	replacemeltdown(void);
void	replacemds(void);

extern long _stac;
extern long _clac;

int cpuid_level = 0;		/* MIN cpuid(0).eax */
char cpu_vendor[16] = { 0 };	/* CPU0's cpuid(0).e[bdc]x, \0 */
int cpu_id = 0;			/* cpuid(1).eax */
int cpu_ebxfeature = 0;		/* cpuid(1).ebx */
int cpu_ecxfeature = 0;		/* INTERSECTION(cpuid(1).ecx) */
int cpu_feature = 0;		/* cpuid(1).edx */
int ecpu_ecxfeature = 0;	/* cpuid(0x80000001).ecx */
int cpu_sev_guestmode = 0;
int cpu_meltdown = 0;
int cpu_use_xsaves = 0;
int need_retpoline = 1;		/* most systems need retpoline */

void
replacesmap(void)
{
	static int replacedone = 0;
	int s;

	if (replacedone)
		return;
	replacedone = 1;

	s = splhigh();

	codepatch_replace(CPTAG_STAC, &_stac, 3);
	codepatch_replace(CPTAG_CLAC, &_clac, 3);

	splx(s);
}

void
replacemeltdown(void)
{
	static int replacedone = 0;
	struct cpu_info *ci = &cpu_info_primary;
	int swapgs_vuln = 0, ibrs = 0, s, ibpb = 0;

	if (ci->ci_vendor == CPUV_INTEL) {
		int family = ci->ci_family;
		int model = ci->ci_model;

		swapgs_vuln = 1;
		if (family == 0x6 &&
		    (model == 0x37 || model == 0x4a || model == 0x4c ||
		     model == 0x4d || model == 0x5a || model == 0x5d ||
		     model == 0x6e || model == 0x65 || model == 0x75)) {
			/* Silvermont, Airmont */
			swapgs_vuln = 0;
		} else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
			/* KnightsLanding */
			swapgs_vuln = 0;
		}
		if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP) &&
		    (rdmsr(MSR_ARCH_CAPABILITIES) & ARCH_CAP_IBRS_ALL)) {
			ibrs = 2;
		} else if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS) {
			ibrs = 1;
		}
		if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBRS)
			ibpb = 1;
	} else if (ci->ci_vendor == CPUV_AMD &&
	    ci->ci_pnfeatset >= 0x80000008) {
		if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_ALWAYSON) {
			ibrs = 2;
		} else if ((ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS) &&
		    (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBRS_PREF)) {
			ibrs = 1;
		}
		if (ci->ci_feature_amdspec_ebx & CPUIDEBX_IBPB)
			ibpb = 1;
	}

	/* Enhanced IBRS: turn it on once on each CPU and don't touch again */
	if (ibrs == 2)
		wrmsr(MSR_SPEC_CTRL, SPEC_CTRL_IBRS);

	if (replacedone)
		return;
	replacedone = 1;

	s = splhigh();

	/* If we don't have IBRS/IBPB, then don't use IBPB */
	if (ibpb == 0)
		codepatch_nop(CPTAG_IBPB_NOP);

	if (ibrs == 2 || (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)) {
		extern const char _jmprax, _jmpr11, _jmpr13;
		extern const short _jmprax_len, _jmpr11_len, _jmpr13_len;

		codepatch_replace(CPTAG_RETPOLINE_RAX, &_jmprax, _jmprax_len);
		codepatch_replace(CPTAG_RETPOLINE_R11, &_jmpr11, _jmpr11_len);
		codepatch_replace(CPTAG_RETPOLINE_R13, &_jmpr13, _jmpr13_len);
		need_retpoline = 0;
	}

	if (!cpu_meltdown)
		codepatch_nop(CPTAG_MELTDOWN_NOP);
	else {
		extern long alltraps_kern_meltdown;

		/* eliminate conditional branch in alltraps */
		codepatch_jmp(CPTAG_MELTDOWN_ALLTRAPS, &alltraps_kern_meltdown);

		/* enable reuse of PCID for U-K page tables */
		if (pmap_use_pcid) {
			extern long _pcid_set_reuse;
			DPRINTF("%s: codepatching PCID use\n", __func__);
			codepatch_replace(CPTAG_PCID_SET_REUSE,
			    &_pcid_set_reuse, PCID_SET_REUSE_SIZE);
		}
	}

	/*
	 * CVE-2019-1125: if the CPU has SMAP and it's not vulnerable to
	 * Meltdown, then it's protected both from speculatively mis-skipping
	 * the swapgs during interrupts of userspace and from speculatively
	 * mis-taking a swapgs during interrupts while already in the kernel
	 * as the speculative path will fault from SMAP.  Warning: enabling
	 * WRGSBASE would break this 'protection'.
	 *
	 * Otherwise, if the CPU's swapgs can't be speculated over and it
	 * _is_ vulnerable to Meltdown then the %cr3 change will serialize
	 * user->kern transitions, but we still need to mitigate the
	 * already-in-kernel cases.
	 */
	if (!cpu_meltdown && (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)) {
		codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
		codepatch_nop(CPTAG_FENCE_NO_SAFE_SMAP);
	} else if (!swapgs_vuln && cpu_meltdown) {
		codepatch_nop(CPTAG_FENCE_SWAPGS_MIS_TAKEN);
	}
	splx(s);
}

void
replacemds(void)
{
	static int replacedone = 0;
	extern long mds_handler_bdw, mds_handler_ivb, mds_handler_skl;
	extern long mds_handler_skl_sse, mds_handler_skl_avx;
	extern long mds_handler_skl_avx512;
	extern long mds_handler_silvermont, mds_handler_knights;
	struct cpu_info *ci = &cpu_info_primary;
	CPU_INFO_ITERATOR cii;
	void *handler = NULL, *vmm_handler = NULL;
	const char *type;
	int use_verw = 0, s;
	uint32_t cap = 0;

	/* ci_mds_tmp must be 64-byte aligned for AVX-512 instructions */
	CTASSERT((offsetof(struct cpu_info, ci_mds_tmp) -
	    offsetof(struct cpu_info, ci_PAGEALIGN)) % 64 == 0);

	if (replacedone)
		return;
	replacedone = 1;

	if (ci->ci_vendor != CPUV_INTEL)
		goto notintel;	/* VERW only needed on Intel */

	if ((ci->ci_feature_sefflags_edx & SEFF0EDX_ARCH_CAP))
		cap = rdmsr(MSR_ARCH_CAPABILITIES);

	if (cap & ARCH_CAP_MDS_NO) {
		/* Unaffected, nop out the handling code */
	} else if (ci->ci_feature_sefflags_edx & SEFF0EDX_MD_CLEAR) {
		/* new firmware, use VERW */
		use_verw = 1;
	} else {
		int family = ci->ci_family;
		int model = ci->ci_model;
		int stepping = CPUID2STEPPING(ci->ci_signature);

		if (family == 0x6 &&
		    (model == 0x2e || model == 0x1e || model == 0x1f ||
		     model == 0x1a || model == 0x2f || model == 0x25 ||
		     model == 0x2c || model == 0x2d || model == 0x2a ||
		     model == 0x3e || model == 0x3a)) {
			/* Nehalem, SandyBridge, IvyBridge */
			handler = vmm_handler = &mds_handler_ivb;
			type = "IvyBridge";
			CPU_INFO_FOREACH(cii, ci) {
				ci->ci_mds_buf = malloc(672, M_DEVBUF,
				    M_WAITOK);
				memset(ci->ci_mds_buf, 0, 16);
			}
		} else if (family == 0x6 &&
		    (model == 0x3f || model == 0x3c || model == 0x45 ||
		     model == 0x46 || model == 0x56 || model == 0x4f ||
		     model == 0x47 || model == 0x3d)) {
			/* Haswell and Broadwell */
			handler = vmm_handler = &mds_handler_bdw;
			type = "Broadwell";
			CPU_INFO_FOREACH(cii, ci) {
				ci->ci_mds_buf = malloc(1536, M_DEVBUF,
				    M_WAITOK);
			}
		} else if (family == 0x6 &&
		    ((model == 0x55 && stepping <= 5) || model == 0x4e ||
		     model == 0x5e || (model == 0x8e && stepping <= 0xb) ||
		     (model == 0x9e && stepping <= 0xc))) {
			/*
			 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
			 * CascadeLake
			 */
			if (xgetbv(0) & XFEATURE_AVX512) {
				handler = &mds_handler_skl_avx512;
				type = "Skylake AVX-512";
			} else if (xgetbv(0) & XFEATURE_AVX) {
				handler = &mds_handler_skl_avx;
				type = "Skylake AVX";
			} else {
				handler = &mds_handler_skl_sse;
				type = "Skylake SSE";
			}
			vmm_handler = &mds_handler_skl;
			CPU_INFO_FOREACH(cii, ci) {
				vaddr_t b64;
				b64 = (vaddr_t)malloc(6 * 1024 + 64 + 63,
				    M_DEVBUF, M_WAITOK);
				ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
				memset(ci->ci_mds_buf, 0, 64);
			}
		} else if (family == 0x6 &&
		    (model == 0x37 || model == 0x4a || model == 0x4c ||
		     model == 0x4d || model == 0x5a || model == 0x5d ||
		     model == 0x6e || model == 0x65 || model == 0x75)) {
			/* Silvermont, Airmont */
			handler = vmm_handler = &mds_handler_silvermont;
			type = "Silvermont";
			CPU_INFO_FOREACH(cii, ci) {
				ci->ci_mds_buf = malloc(256, M_DEVBUF,
				    M_WAITOK);
				memset(ci->ci_mds_buf, 0, 16);
			}
		} else if (family == 0x6 && (model == 0x85 || model == 0x57)) {
			handler = vmm_handler = &mds_handler_knights;
			type = "KnightsLanding";
			CPU_INFO_FOREACH(cii, ci) {
				vaddr_t b64;
				b64 = (vaddr_t)malloc(1152 + 63, M_DEVBUF,
				    M_WAITOK);
				ci->ci_mds_buf = (void *)((b64 + 63) & ~63);
			}
		}
	}

	/* Register File Data Sampling (RFDS) also has a VERW workaround */
	if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR))
		use_verw = 1;

	if (handler != NULL) {
		printf("cpu0: using %s MDS workaround%s\n", type, "");
		s = splhigh();
		codepatch_call(CPTAG_MDS, handler);
		codepatch_call(CPTAG_MDS_VMM, vmm_handler);
		splx(s);
	} else if (use_verw) {
		/*
		 * The new firmware enhances L1D_FLUSH MSR to flush MDS too,
		 * but keep the verw if affected by RFDS
		 */
		if ((cap & ARCH_CAP_RFDS_NO) == 0 && (cap & ARCH_CAP_RFDS_CLEAR)) {
			type = "";
		} else if (cpu_info_primary.ci_vmm_cap.vcc_vmx.vmx_has_l1_flush_msr == 1) {
			s = splhigh();
			codepatch_nop(CPTAG_MDS_VMM);
			splx(s);
			type = " (except on vmm entry)";
		} else {
			type = "";
		}
		printf("cpu0: using %s MDS workaround%s\n", "VERW", type);
	} else {
notintel:
		s = splhigh();
		codepatch_nop(CPTAG_MDS);
		codepatch_nop(CPTAG_MDS_VMM);
		splx(s);
	}
}

#ifdef MULTIPROCESSOR
int mp_cpu_start(struct cpu_info *);
void mp_cpu_start_cleanup(struct cpu_info *);
struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
    mp_cpu_start_cleanup };
#endif /* MULTIPROCESSOR */

const struct cfattach cpu_ca = {
	sizeof(struct cpu_softc), cpu_match, cpu_attach, NULL, cpu_activate
};

struct cfdriver cpu_cd = {
	NULL, "cpu", DV_DULL
};

/*
 * Statically-allocated CPU info for the primary CPU (or the only
 * CPU, on uniprocessors).  The CPU info list is initialized to
 * point at it.
 */
struct cpu_info_full cpu_info_full_primary = { .cif_cpu = { .ci_self = &cpu_info_primary } };

struct cpu_info *cpu_info_list = &cpu_info_primary;

#ifdef MULTIPROCESSOR
/*
 * Array of CPU info structures.  Must be statically-allocated because
 * curproc, etc. are used early.
 */
struct cpu_info *cpu_info[MAXCPUS] = { &cpu_info_primary };

void    	cpu_hatch(void *);
void    	cpu_boot_secondary(struct cpu_info *ci);
void    	cpu_start_secondary(struct cpu_info *ci);
#endif

int
cpu_match(struct device *parent, void *match, void *aux)
{
	struct cfdata *cf = match;
	struct cpu_attach_args *caa = aux;

	if (strcmp(caa->caa_name, cf->cf_driver->cd_name) != 0)
		return 0;

	if (cf->cf_unit >= MAXCPUS)
		return 0;

	return 1;
}

void	cpu_idle_mwait_cycle(void);
void	cpu_init_mwait(struct cpu_softc *, struct cpu_info *);

u_int	cpu_mwait_size, cpu_mwait_states;

void
cpu_idle_mwait_cycle(void)
{
	struct cpu_info *ci = curcpu();

	if ((read_rflags() & PSL_I) == 0)
		panic("idle with interrupts blocked!");

	/* something already queued? */
	if (!cpu_is_idle(ci))
		return;

	/*
	 * About to idle; setting the MWAIT_IN_IDLE bit tells
	 * cpu_unidle() that it can't be a no-op and tells cpu_kick()
	 * that it doesn't need to use an IPI.  We also set the
	 * MWAIT_KEEP_IDLING bit: those routines clear it to stop
	 * the mwait.  Once they're set, we do a final check of the
	 * queue, in case another cpu called setrunqueue() and added
	 * something to the queue and called cpu_unidle() between
	 * the check in sched_idle() and here.
	 */
	atomic_setbits_int(&ci->ci_mwait, MWAIT_IDLING | MWAIT_ONLY);
	if (cpu_is_idle(ci)) {
		monitor(&ci->ci_mwait, 0, 0);
		if ((ci->ci_mwait & MWAIT_IDLING) == MWAIT_IDLING)
			mwait(0, 0);
	}

	/* done idling; let cpu_kick() know that an IPI is required */
	atomic_clearbits_int(&ci->ci_mwait, MWAIT_IDLING);
}

void
cpu_init_mwait(struct cpu_softc *sc, struct cpu_info *ci)
{
	unsigned int smallest, largest, extensions, c_substates;

	if ((cpu_ecxfeature & CPUIDECX_MWAIT) == 0 || ci->ci_cpuid_level < 0x5)
		return;

	/* get the monitor granularity */
	CPUID(0x5, smallest, largest, extensions, cpu_mwait_states);
	smallest &= 0xffff;
	largest  &= 0xffff;

	/* mask out states C6/C7 in 31:24 for CHT45 errata */
	if (ci->ci_vendor == CPUV_INTEL &&
	    ci->ci_family == 0x06 && ci->ci_model == 0x4c)
		cpu_mwait_states &= 0x00ffffff;

	printf("%s: mwait min=%u, max=%u", sc->sc_dev.dv_xname,
	    smallest, largest);
	if (extensions & 0x1) {
		if (cpu_mwait_states > 0) {
			c_substates = cpu_mwait_states;
			printf(", C-substates=%u", 0xf & c_substates);
			while ((c_substates >>= 4) > 0)
				printf(".%u", 0xf & c_substates);
		}
		if (extensions & 0x2)
			printf(", IBE");
	} else {
		/* substates not supported, forge the default: just C1 */
		cpu_mwait_states = 1 << 4;
	}

	/* paranoia: check the values */
	if (smallest < sizeof(int) || largest < smallest ||
	    (largest & (sizeof(int)-1)))
		printf(" (bogus)");
	else
		cpu_mwait_size = largest;
	printf("\n");

	/* enable use of mwait; may be overridden by acpicpu later */
	if (cpu_mwait_size > 0)
		cpu_idle_cycle_fcn = &cpu_idle_mwait_cycle;
}

void
cpu_attach(struct device *parent, struct device *self, void *aux)
{
	struct cpu_softc *sc = (void *) self;
	struct cpu_attach_args *caa = aux;
	struct cpu_info *ci;
#if defined(MULTIPROCESSOR)
	int cpunum = sc->sc_dev.dv_unit;
	vaddr_t kstack;
	struct pcb *pcb;
#endif

	/*
	 * If we're an Application Processor, allocate a cpu_info
	 * structure, otherwise use the primary's.
	 */
	if (caa->cpu_role == CPU_ROLE_AP) {
		struct cpu_info_full *cif;

		cif = km_alloc(sizeof *cif, &kv_any, &kp_zero, &kd_waitok);
		ci = &cif->cif_cpu;
#if defined(MULTIPROCESSOR)
		ci->ci_tss = &cif->cif_tss;
		ci->ci_gdt = &cif->cif_gdt;
		memcpy(ci->ci_gdt, cpu_info_primary.ci_gdt, GDT_SIZE);
		cpu_enter_pages(cif);
		if (cpu_info[cpunum] != NULL)
			panic("cpu at apic id %d already attached?", cpunum);
		cpu_info[cpunum] = ci;
#endif
#ifdef TRAPLOG
		ci->ci_tlog_base = malloc(sizeof(struct tlog),
		    M_DEVBUF, M_WAITOK);
#endif
	} else {
		ci = &cpu_info_primary;
#if defined(MULTIPROCESSOR)
		if (caa->cpu_apicid != lapic_cpu_number()) {
			panic("%s: running cpu is at apic %d"
			    " instead of at expected %d",
			    sc->sc_dev.dv_xname, lapic_cpu_number(), caa->cpu_apicid);
		}
#endif
	}

	ci->ci_self = ci;
	sc->sc_info = ci;

	ci->ci_dev = self;
	ci->ci_apicid = caa->cpu_apicid;
	ci->ci_acpi_proc_id = caa->cpu_acpi_proc_id;
#ifdef MULTIPROCESSOR
	ci->ci_cpuid = cpunum;
#else
	ci->ci_cpuid = 0;	/* False for APs, but they're not used anyway */
#endif
	ci->ci_func = caa->cpu_func;
	ci->ci_handled_intr_level = IPL_NONE;

#ifndef SMALL_KERNEL
	strlcpy(ci->ci_sensordev.xname, ci->ci_dev->dv_xname,
	    sizeof(ci->ci_sensordev.xname));
#endif

#if defined(MULTIPROCESSOR)
	/*
	 * Allocate UPAGES contiguous pages for the idle PCB and stack.
	 */
	kstack = (vaddr_t)km_alloc(USPACE, &kv_any, &kp_dirty, &kd_nowait);
	if (kstack == 0) {
		if (caa->cpu_role != CPU_ROLE_AP) {
			panic("cpu_attach: unable to allocate idle stack for"
			    " primary");
		}
		printf("%s: unable to allocate idle stack\n",
		    sc->sc_dev.dv_xname);
		return;
	}
	pcb = ci->ci_idle_pcb = (struct pcb *) kstack;
	memset(pcb, 0, USPACE);

	pcb->pcb_kstack = kstack + USPACE - 16;
	pcb->pcb_rbp = pcb->pcb_rsp = kstack + USPACE - 16;
	pcb->pcb_pmap = pmap_kernel();
	pcb->pcb_cr3 = pcb->pcb_pmap->pm_pdirpa;
#endif

	/* further PCB init done later. */

	printf(": ");

	switch (caa->cpu_role) {
	case CPU_ROLE_SP:
		printf("(uniprocessor)\n");
		atomic_setbits_int(&ci->ci_flags,
		    CPUF_PRESENT | CPUF_SP | CPUF_PRIMARY);
		cpu_intr_init(ci);
		identifycpu(ci);
		cpu_fix_msrs(ci);
#ifdef MTRR
		mem_range_attach();
#endif /* MTRR */
		/* XXX SP fpuinit(ci) is done earlier */
		cpu_init(ci);
		cpu_init_mwait(sc, ci);
		break;

	case CPU_ROLE_BP:
		printf("apid %d (boot processor)\n", caa->cpu_apicid);
		atomic_setbits_int(&ci->ci_flags,
		    CPUF_PRESENT | CPUF_BSP | CPUF_PRIMARY);
		cpu_intr_init(ci);
		identifycpu(ci);
		cpu_fix_msrs(ci);
#ifdef MTRR
		mem_range_attach();
#endif /* MTRR */

#if NLAPIC > 0
		/*
		 * Enable local apic
		 */
		lapic_enable();
		lapic_calibrate_timer(ci);
#endif
		/* XXX BP fpuinit(ci) is done earlier */
		cpu_init(ci);

#if NIOAPIC > 0
		ioapic_bsp_id = caa->cpu_apicid;
#endif
		cpu_init_mwait(sc, ci);
		break;

	case CPU_ROLE_AP:
		/*
		 * report on an AP
		 */
		printf("apid %d (application processor)\n", caa->cpu_apicid);

#if defined(MULTIPROCESSOR)
		cpu_intr_init(ci);
		cpu_start_secondary(ci);
		clockqueue_init(&ci->ci_queue);
		sched_init_cpu(ci);
		ncpus++;
		if (ci->ci_flags & CPUF_PRESENT) {
			ci->ci_next = cpu_info_list->ci_next;
			cpu_info_list->ci_next = ci;
		}
#else
		printf("%s: not started\n", sc->sc_dev.dv_xname);
#endif
		break;

	default:
		panic("unknown processor type??");
	}

#if defined(MULTIPROCESSOR)
	if (mp_verbose) {
		printf("%s: kstack at 0x%lx for %d bytes\n",
		    sc->sc_dev.dv_xname, kstack, USPACE);
		printf("%s: idle pcb at %p, idle sp at 0x%llx\n",
		    sc->sc_dev.dv_xname, pcb, pcb->pcb_rsp);
	}
#endif
#if NVMM > 0
	cpu_init_vmm(ci);
#endif /* NVMM > 0 */

#ifndef SMALL_KERNEL
	if (ci->ci_sensordev.sensors_count > 0)
		sensordev_install(&ci->ci_sensordev);
#endif
}

static void
replacexsave(int xsave_ext)
{
	extern long _xrstor, _xrstors, _xsave, _xsaves, _xsaveopt;
	static int replacedone = 0;
	int s;

	if (replacedone)
		return;
	replacedone = 1;

	s = splhigh();
	codepatch_replace(CPTAG_XRSTORS,
	    (xsave_ext & XSAVE_XSAVES) ? &_xrstors : &_xrstor, 4);
	codepatch_replace(CPTAG_XRSTOR, &_xrstor, 4);
	codepatch_replace(CPTAG_XSAVE,
	    (xsave_ext & XSAVE_XSAVES) ? &_xsaves :
	    (xsave_ext & XSAVE_XSAVEOPT) ? &_xsaveopt : &_xsave, 4);
	splx(s);
}


/*
 * Initialize the processor appropriately.
 */

void
cpu_init(struct cpu_info *ci)
{
	struct savefpu *sfp;
	u_int cr4;

	/* configure the CPU if needed */
	if (ci->cpu_setup != NULL)
		(*ci->cpu_setup)(ci);

	cr4 = rcr4() | CR4_DEFAULT;
	if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMEP)
		cr4 |= CR4_SMEP;
	if (ci->ci_feature_sefflags_ebx & SEFF0EBX_SMAP)
		cr4 |= CR4_SMAP;
	if (ci->ci_feature_sefflags_ecx & SEFF0ECX_UMIP)
		cr4 |= CR4_UMIP;
	if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd)
		cr4 |= CR4_OSXSAVE;
	if (pg_xo)
		cr4 |= CR4_PKE;
	if (pmap_use_pcid)
		cr4 |= CR4_PCIDE;
	lcr4(cr4);

	if ((cpu_ecxfeature & CPUIDECX_XSAVE) && ci->ci_cpuid_level >= 0xd) {
		u_int32_t eax, ebx, ecx, edx;

		xsave_mask = XFEATURE_X87 | XFEATURE_SSE;
		CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
		xsave_mask |= eax & XFEATURE_AVX;
		xsave_mask |= eax & XFEATURE_AVX512;
		xsetbv(0, xsave_mask);
		CPUID_LEAF(0xd, 0, eax, ebx, ecx, edx);
		if (CPU_IS_PRIMARY(ci)) {
			fpu_save_len = ebx;
			KASSERT(fpu_save_len <= sizeof(struct savefpu));
		} else {
			KASSERT(ebx == fpu_save_len);
		}

		/* check for xsaves, xsaveopt, and supervisor features */
		CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
		/* Disable XSAVES on AMD family 17h due to Erratum 1386 */
		if (ci->ci_vendor == CPUV_AMD &&
		    ci->ci_family == 0x17) {
			eax &= ~XSAVE_XSAVES;
		}
		if (eax & XSAVE_XSAVES) {
#ifndef SMALL_KERNEL
			if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
				xsave_mask |= ecx & XFEATURE_CET_U;
#endif
			if (xsave_mask & XFEATURE_XSS_MASK) {
				wrmsr(MSR_XSS, xsave_mask & XFEATURE_XSS_MASK);
				CPUID_LEAF(0xd, 1, eax, ebx, ecx, edx);
				KASSERT(ebx <= sizeof(struct savefpu));
			}
			if (CPU_IS_PRIMARY(ci))
				cpu_use_xsaves = 1;
		}

		replacexsave(eax);
	}

	if (CPU_IS_PRIMARY(ci)) {
		/* Clean our FPU save area */
		sfp = fpu_cleandata;
		memset(sfp, 0, fpu_save_len);
		sfp->fp_fxsave.fx_fcw = __INITIAL_NPXCW__;
		sfp->fp_fxsave.fx_mxcsr = __INITIAL_MXCSR__;
		xrstor_user(sfp, xsave_mask);
		if (cpu_use_xsaves || !xsave_mask)
			fpusave(sfp);
		else {
			/* must not use xsaveopt here */
			xsave(sfp, xsave_mask);
		}
	} else {
		fpureset();
	}

#if NVMM > 0
	/* Re-enable VMM if needed */
	if (ci->ci_flags & CPUF_VMM)
		start_vmm_on_cpu(ci);
#endif /* NVMM > 0 */

#ifdef MULTIPROCESSOR
	atomic_setbits_int(&ci->ci_flags, CPUF_RUNNING);
	/*
	 * Big hammer: flush all TLB entries, including ones from PTEs
	 * with the G bit set.  This should only be necessary if TLB
	 * shootdown falls far behind.
	 */
	cr4 = rcr4();
	lcr4(cr4 & ~CR4_PGE);
	lcr4(cr4);

	/* Check if TSC is synchronized. */
	if (cold && !CPU_IS_PRIMARY(ci))
		tsc_test_sync_ap(ci);
#endif
}

#if NVMM > 0
/*
 * cpu_init_vmm
 *
 * Initializes per-cpu VMM state
 *
 * Parameters:
 *  ci: the cpu for which state is being initialized
 */
void
cpu_init_vmm(struct cpu_info *ci)
{
	uint64_t msr;

	/*
	 * Detect VMX specific features and initialize VMX-related state.
	 */
	if (ci->ci_vmm_flags & CI_VMM_VMX) {
		ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE,
		    M_DEVBUF, M_WAITOK | M_ZERO);
		if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region,
		    &ci->ci_vmxon_region_pa))
			panic("Can't locate VMXON region in phys mem");

		ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR;
		rw_init(&ci->ci_vmcs_lock, "vmcslock");

		if (rdmsr_safe(IA32_VMX_EPT_VPID_CAP, &msr) == 0 &&
		    msr & IA32_EPT_VPID_CAP_INVEPT_CONTEXT)
			ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode =
			    IA32_VMX_INVEPT_SINGLE_CTX;
		else
			ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode =
			    IA32_VMX_INVEPT_GLOBAL_CTX;
	}
}
#endif /* NVMM > 0 */

#ifdef MULTIPROCESSOR
void
cpu_boot_secondary_processors(void)
{
	struct cpu_info *ci;
	u_long i;

	for (i=0; i < MAXCPUS; i++) {
		ci = cpu_info[i];
		if (ci == NULL)
			continue;
		if (ci->ci_idle_pcb == NULL)
			continue;
		if ((ci->ci_flags & CPUF_PRESENT) == 0)
			continue;
		if (ci->ci_flags & (CPUF_BSP | CPUF_SP | CPUF_PRIMARY))
			continue;
		ci->ci_randseed = (arc4random() & 0x7fffffff) + 1;
		cpu_boot_secondary(ci);
	}
}

void
cpu_start_secondary(struct cpu_info *ci)
{
	int i;
	u_long s;

	atomic_setbits_int(&ci->ci_flags, CPUF_AP);

	pmap_kenter_pa(MP_TRAMPOLINE, MP_TRAMPOLINE, PROT_READ | PROT_EXEC);
	pmap_kenter_pa(MP_TRAMP_DATA, MP_TRAMP_DATA, PROT_READ | PROT_WRITE);

	CPU_STARTUP(ci);

	/*
	 * wait for it to become ready
	 */
	for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i>0;i--) {
		delay(10);
	}
	if (! (ci->ci_flags & CPUF_PRESENT)) {
		printf("%s: failed to become ready\n", ci->ci_dev->dv_xname);
#if defined(MPDEBUG) && defined(DDB)
		printf("dropping into debugger; continue from here to resume boot\n");
		db_enter();
#endif
	}

	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
		atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFY);

		/* wait for it to identify */
		for (i = 2000000; (ci->ci_flags & CPUF_IDENTIFY) && i > 0; i--)
			delay(10);

		if (ci->ci_flags & CPUF_IDENTIFY)
			printf("%s: failed to identify\n",
			    ci->ci_dev->dv_xname);
	}

	if (ci->ci_flags & CPUF_IDENTIFIED) {
		/*
		 * Test if TSCs are synchronized.  Invalidate cache to
		 * minimize possible cache effects.  Disable interrupts to
		 * try to rule out external interference.
		 */
		s = intr_disable();
		wbinvd();
		tsc_test_sync_bp(curcpu());
		intr_restore(s);
	}

	CPU_START_CLEANUP(ci);

	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
}

void
cpu_boot_secondary(struct cpu_info *ci)
{
	int i;
	u_long s;

	atomic_setbits_int(&ci->ci_flags, CPUF_GO);

	for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i>0;i--) {
		delay(10);
	}
	if (! (ci->ci_flags & CPUF_RUNNING)) {
		printf("cpu failed to start\n");
#if defined(MPDEBUG) && defined(DDB)
		printf("dropping into debugger; continue from here to resume boot\n");
		db_enter();
#endif
	} else if (cold) {
		/* Test if TSCs are synchronized again. */
		s = intr_disable();
		wbinvd();
		tsc_test_sync_bp(curcpu());
		intr_restore(s);
	}
}

/*
 * The CPU ends up here when it's ready to run
 * This is called from code in mptramp.s; at this point, we are running
 * in the idle pcb/idle stack of the new cpu.  When this function returns,
 * this processor will enter the idle loop and start looking for work.
 *
 * XXX should share some of this with init386 in machdep.c
 */
void
cpu_hatch(void *v)
{
	struct cpu_info *ci = (struct cpu_info *)v;
	int s;

	{
		uint32_t vendor[4];
		int level;

		CPUID(0, level, vendor[0], vendor[2], vendor[1]);
		vendor[3] = 0;
		cpu_set_vendor(ci, level, (const char *)vendor);
	}

	cpu_init_msrs(ci);

#ifdef DEBUG
	if (ci->ci_flags & CPUF_PRESENT)
		panic("%s: already running!?", ci->ci_dev->dv_xname);
#endif
	atomic_setbits_int(&ci->ci_flags, CPUF_PRESENT);

	lapic_enable();
	cpu_ucode_apply(ci);
	cpu_tsx_disable(ci);

	if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) {
		/*
		 * We need to wait until we can identify, otherwise dmesg
		 * output will be messy.
		 */
		while ((ci->ci_flags & CPUF_IDENTIFY) == 0)
			delay(10);

		identifycpu(ci);

		/* Prevent identifycpu() from running again */
		atomic_setbits_int(&ci->ci_flags, CPUF_IDENTIFIED);

		/* Signal we're done */
		atomic_clearbits_int(&ci->ci_flags, CPUF_IDENTIFY);
	}

	/* These have to run after identifycpu() */
	cpu_fix_msrs(ci);

	/*
	 * Test if our TSC is synchronized for the first time.
	 * Note that interrupts are off at this point.
	 */
	wbinvd();
	tsc_test_sync_ap(ci);

	while ((ci->ci_flags & CPUF_GO) == 0)
		delay(10);
#ifdef HIBERNATE
	if ((ci->ci_flags & CPUF_PARK) != 0) {
		if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT)
			lcr4(rcr4() & ~CR4_CET);
		atomic_clearbits_int(&ci->ci_flags, CPUF_PARK);
		hibernate_drop_to_real_mode();
	}
#endif /* HIBERNATE */

#ifdef DEBUG
	if (ci->ci_flags & CPUF_RUNNING)
		panic("%s: already running!?", ci->ci_dev->dv_xname);
#endif

	cpu_init_idt();
	lapic_set_lvt();
	gdt_init_cpu(ci);
	fpuinit(ci);

	lldt(0);

	cpu_init(ci);
#if NPVBUS > 0
	pvbus_init_cpu();
#endif

	/* Re-initialise memory range handling on AP */
	if (mem_range_softc.mr_op != NULL)
		mem_range_softc.mr_op->initAP(&mem_range_softc);

	s = splhigh();
	lcr8(0);
	intr_enable();
	splx(s);

	lapic_startclock();

	sched_toidle();
}

#if defined(DDB)

#include <ddb/db_output.h>
#include <machine/db_machdep.h>

/*
 * Dump cpu information from ddb.
 */
void
cpu_debug_dump(void)
{
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	db_printf("addr dev id flags ipis curproc\n");
	CPU_INFO_FOREACH(cii, ci) {
		db_printf("%p %s %u %x %x %10p\n",
		    ci,
		    ci->ci_dev == NULL ? "BOOT" : ci->ci_dev->dv_xname,
		    ci->ci_cpuid,
		    ci->ci_flags, ci->ci_ipis,
		    ci->ci_curproc);
	}
}
#endif

int
mp_cpu_start(struct cpu_info *ci)
{
	unsigned short dwordptr[2];

	/*
	 * "The BSP must initialize CMOS shutdown code to 0Ah ..."
	 */

	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_JUMP);

	/*
	 * "and the warm reset vector (DWORD based at 40:67) to point
	 * to the AP startup code ..."
	 */

	dwordptr[0] = 0;
	dwordptr[1] = MP_TRAMPOLINE >> 4;

	pmap_kenter_pa(0, 0, PROT_READ | PROT_WRITE);
	memcpy((u_int8_t *) 0x467, dwordptr, 4);
	pmap_kremove(0, PAGE_SIZE);

#if NLAPIC > 0
	/*
	 * ... prior to executing the following sequence:"
	 */

	if (ci->ci_flags & CPUF_AP) {
		x86_ipi_init(ci->ci_apicid);

		delay(10000);

		if (cpu_feature & CPUID_APIC) {
			x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
			    LAPIC_DLMODE_STARTUP);
			delay(200);

			x86_ipi(MP_TRAMPOLINE/PAGE_SIZE, ci->ci_apicid,
			    LAPIC_DLMODE_STARTUP);
			delay(200);
		}
	}
#endif
	return 0;
}

void
mp_cpu_start_cleanup(struct cpu_info *ci)
{
	/*
	 * Ensure the NVRAM reset byte contains something vaguely sane.
	 */

	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_RST);
}
#endif	/* MULTIPROCESSOR */

typedef void (vector)(void);
extern vector Xsyscall_meltdown, Xsyscall, Xsyscall32;

void
cpu_init_msrs(struct cpu_info *ci)
{
	wrmsr(MSR_STAR,
	    ((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((uint64_t)GSEL(GUDATA_SEL-1, SEL_UPL) << 48));
	wrmsr(MSR_LSTAR, cpu_meltdown ? (uint64_t)Xsyscall_meltdown :
	    (uint64_t)Xsyscall);
	wrmsr(MSR_CSTAR, 0);
	wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);

	wrmsr(MSR_FSBASE, 0);
	wrmsr(MSR_GSBASE, (u_int64_t)ci);
	wrmsr(MSR_KERNELGSBASE, 0);
	patinit(ci);
}

void
cpu_fix_msrs(struct cpu_info *ci)
{
	int family = ci->ci_family;
	uint64_t msr, nmsr;

	if (ci->ci_vendor == CPUV_INTEL) {
		if ((family > 6 || (family == 6 && ci->ci_model >= 0xd)) &&
		    rdmsr_safe(MSR_MISC_ENABLE, &msr) == 0 &&
		    (msr & MISC_ENABLE_FAST_STRINGS) == 0) {
			msr |= MISC_ENABLE_FAST_STRINGS;
			wrmsr(MSR_MISC_ENABLE, msr);
			DPRINTF("%s: enabled fast strings\n", ci->ci_dev->dv_xname);

			/*
			 * Attempt to disable Silicon Debug and lock the configuration
			 * if it's enabled and unlocked.
			 */
			if (cpu_ecxfeature & CPUIDECX_SDBG) {
				msr = rdmsr(IA32_DEBUG_INTERFACE);
				if ((msr & IA32_DEBUG_INTERFACE_ENABLE) &&
				    (msr & IA32_DEBUG_INTERFACE_LOCK) == 0) {
					msr &= IA32_DEBUG_INTERFACE_MASK;
					msr |= IA32_DEBUG_INTERFACE_LOCK;
					wrmsr(IA32_DEBUG_INTERFACE, msr);
				} else if (msr & IA32_DEBUG_INTERFACE_ENABLE)
					printf("%s: cannot disable silicon debug\n",
					    ci->ci_dev->dv_xname);
			}
		}
	}

	if (ci->ci_vendor == CPUV_AMD) {
		/* Apply AMD errata */
		amd64_errata(ci);

		/*
		 * "Mitigation G-2" per AMD's Whitepaper "Software Techniques
		 * for Managing Speculation on AMD Processors"
		 *
		 * By setting MSR C001_1029[1]=1, LFENCE becomes a dispatch
		 * serializing instruction.
		 *
		 * This MSR is available on all AMD families >= 10h, except 11h
		 * where LFENCE is always serializing.
		 */
		if (family >= 0x10 && family != 0x11) {
			nmsr = msr = rdmsr(MSR_DE_CFG);
			nmsr |= DE_CFG_SERIALIZE_LFENCE;
			if (msr != nmsr)
				wrmsr(MSR_DE_CFG, nmsr);
		}
		if (family == 0x17 && ci->ci_model >= 0x31 &&
		    (cpu_ecxfeature & CPUIDECX_HV) == 0) {
			nmsr = msr = rdmsr(MSR_DE_CFG);
			nmsr |= DE_CFG_SERIALIZE_9;
			if (msr != nmsr)
				wrmsr(MSR_DE_CFG, nmsr);
		}
	}

#ifndef SMALL_KERNEL
	if (ci->ci_feature_sefflags_edx & SEFF0EDX_IBT) {
		msr = rdmsr(MSR_S_CET);
		wrmsr(MSR_S_CET, (msr & ~MSR_CET_NO_TRACK_EN) | MSR_CET_ENDBR_EN);
		lcr4(rcr4() | CR4_CET);
	}
#endif
}

void
cpu_tsx_disable(struct cpu_info *ci)
{
	uint64_t msr;
	uint32_t dummy, sefflags_edx;

	/* this runs before identifycpu() populates ci_feature_sefflags_edx */
	if (ci->ci_cpuid_level < 0x07)
		return;
	CPUID_LEAF(0x7, 0, dummy, dummy, dummy, sefflags_edx);

	if (ci->ci_vendor == CPUV_INTEL &&
	    (sefflags_edx & SEFF0EDX_ARCH_CAP)) {
		msr = rdmsr(MSR_ARCH_CAPABILITIES);
		if (msr & ARCH_CAP_TSX_CTRL) {
			msr = rdmsr(MSR_TSX_CTRL);
			msr |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_TSX_CPUID_CLEAR;
			wrmsr(MSR_TSX_CTRL, msr);
		}
	}
}

void
patinit(struct cpu_info *ci)
{
	extern int	pmap_pg_wc;
	u_int64_t	reg;

	if ((cpu_feature & CPUID_PAT) == 0)
		return;
	/*
	 * Set up PAT bits.
	 * The default pat table is the following:
	 * WB, WT, UC-, UC, WB, WT, UC-, UC
	 * We change it to:
	 * WB, WC, UC-, UC, WB, WC, UC-, UC
	 * i.e change the WT bit to be WC.
	 */
	reg = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
	    PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
	    PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
	    PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);

	wrmsr(MSR_CR_PAT, reg);
	pmap_pg_wc = PG_WC;
}

struct timeout rdrand_tmo;
void rdrand(void *);

void
rdrand(void *v)
{
	struct timeout *tmo = v;
	extern int	has_rdrand, has_rdseed;
	union {
		uint64_t u64;
		uint32_t u32[2];
	} r, t;
	uint64_t tsc;
	uint8_t valid = 0;

	tsc = rdtsc();
	if (has_rdseed)
		__asm volatile(
		    "rdseed	%0\n\t"
		    "setc	%1\n"
		    : "=r" (r.u64), "=qm" (valid) );
	if (has_rdrand && (has_rdseed == 0 || valid == 0))
		__asm volatile(
		    "rdrand	%0\n\t"
		    "setc	%1\n"
		    : "=r" (r.u64), "=qm" (valid) );

	t.u64 = tsc;
	t.u64 ^= r.u64;
	t.u64 ^= valid;			/* potential rdrand empty */
	if (has_rdrand)
		t.u64 += rdtsc();	/* potential vmexit latency */

	enqueue_randomness(t.u32[0]);
	enqueue_randomness(t.u32[1]);

	if (tmo)
		timeout_add_msec(tmo, 10);
}

int
cpu_activate(struct device *self, int act)
{
	struct cpu_softc *sc = (struct cpu_softc *)self;

	switch (act) {
	case DVACT_RESUME:
		if (sc->sc_info->ci_cpuid == 0)
			rdrand(NULL);
#if NPCTR > 0
		pctr_resume(sc->sc_info);
#endif
		break;
	}

	return (0);
}

/*
 * cpu_enter_pages
 *
 * Requests mapping of various special pages required in the Intel Meltdown
 * case (to be entered into the U-K page table):
 *
 *  1 tss+gdt page for each CPU
 *  1 trampoline stack page for each CPU
 *
 * The cpu_info_full struct for each CPU straddles these pages.  The offset into
 * 'cif' is calculated below, for each page.  For more information, consult
 * the definition of struct cpu_info_full in cpu_full.h
 *
 * On CPUs unaffected by Meltdown, this function still configures 'cif' but
 * the calls to pmap_enter_special become no-ops.
 *
 * Parameters:
 *  cif : the cpu_info_full structure describing a CPU whose pages are to be
 *   entered into the special meltdown U-K page table.
 */
void
cpu_enter_pages(struct cpu_info_full *cif)
{
	vaddr_t va;
	paddr_t pa;

	/* The TSS+GDT need to be readable */
	va = (vaddr_t)cif;
	pmap_extract(pmap_kernel(), va, &pa);
	pmap_enter_special(va, pa, PROT_READ);
	DPRINTF("%s: entered tss+gdt page at va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)va, (uint64_t)pa);

	/* The trampoline stack page needs to be read/write */
	va = (vaddr_t)&cif->cif_tramp_stack;
	pmap_extract(pmap_kernel(), va, &pa);
	pmap_enter_special(va, pa, PROT_READ | PROT_WRITE);
	DPRINTF("%s: entered t.stack page at va 0x%llx pa 0x%llx\n", __func__,
	    (uint64_t)va, (uint64_t)pa);

	cif->cif_tss.tss_rsp0 = va + sizeof(cif->cif_tramp_stack) - 16;
	DPRINTF("%s: cif_tss.tss_rsp0 = 0x%llx\n", __func__,
	    (uint64_t)cif->cif_tss.tss_rsp0);
	cif->cif_cpu.ci_intr_rsp = cif->cif_tss.tss_rsp0 -
	    sizeof(struct iretq_frame);

#define	SETUP_IST_SPECIAL_STACK(ist, cif, member) do {			\
	(cif)->cif_tss.tss_ist[(ist)] = (vaddr_t)&(cif)->member +	\
	    sizeof((cif)->member) - 16;					\
	(cif)->member[nitems((cif)->member) - 2] = (int64_t)&(cif)->cif_cpu; \
} while (0)

	SETUP_IST_SPECIAL_STACK(0, cif, cif_dblflt_stack);
	SETUP_IST_SPECIAL_STACK(1, cif, cif_nmi_stack);

	/* an empty iomap, by setting its offset to the TSS limit */
	cif->cif_tss.tss_iobase = sizeof(cif->cif_tss);
}

#ifdef MULTIPROCESSOR
int
wbinvd_on_all_cpus(void)
{
	x86_broadcast_ipi(X86_IPI_WBINVD);
	wbinvd();
	return 0;
}

volatile long wbinvd_wait __attribute__((section(".kudata")));

void
wbinvd_on_all_cpus_acked(void)
{
	struct cpu_info *ci, *self = curcpu();
	CPU_INFO_ITERATOR cii;
	long wait = 0;
	u_int64_t mask = 0;
	int s;

	CPU_INFO_FOREACH(cii, ci) {
		if (ci == self)
			continue;
		mask |= (1ULL << ci->ci_cpuid);
		wait++;
	}

	KASSERT(wait > 0);

	s = splvm();
	while (atomic_cas_ulong(&wbinvd_wait, 0, wait) != 0) {
		while (wbinvd_wait != 0) {
			CPU_BUSY_CYCLE();
		}
	}

	CPU_INFO_FOREACH(cii, ci) {
		if ((mask & (1ULL << ci->ci_cpuid)) == 0)
			continue;
		if (x86_fast_ipi(ci, LAPIC_IPI_WBINVD) != 0)
			panic("%s: ipi failed", __func__);
	}
	splx(s);

	wbinvd();

	while (wbinvd_wait != 0)
		CPU_BUSY_CYCLE();
}
#endif /* MULTIPROCESSOR */

int cpu_suspended;
int cpu_wakeups;

#ifdef SUSPEND

void
cpu_suspend_cycle(void)
{
	if (cpu_suspend_cycle_fcn)
		cpu_suspend_cycle_fcn();
	else
		cpu_idle_cycle_fcn();
}

int
cpu_suspend_primary(void)
{
	struct cpu_info *ci = curcpu();

	/* Mask clock interrupts. */
	local_pic.pic_hwmask(&local_pic, 0);

	/*
	 * All non-wakeup interrupts should be masked at this point;
	 * re-enable interrupts such that wakeup interrupts actually
	 * wake us up.  Set a flag such that drivers can tell we're
	 * suspended and change their behaviour accordingly.  They can
	 * wake us up by clearing the flag.
	 */
	cpu_suspended = 1;
	ci->ci_ilevel = IPL_NONE;
	intr_enable();

	while (cpu_suspended) {
		cpu_suspend_cycle();
		cpu_wakeups++;
	}

	intr_disable();
	ci->ci_ilevel = IPL_HIGH;

	/* Unmask clock interrupts. */
	local_pic.pic_hwunmask(&local_pic, 0);

	return 0;
}

#endif