/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_acpi.h"
#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_gdb.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"
#include "opt_stack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#include <sys/csan.h>
#include <sys/interrupt.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/stack.h>
#include <x86/ucode.h>

#ifdef DEV_ACPI
#include <contrib/dev/acpica/include/acpi.h>
#include <dev/acpica/acpivar.h>
#endif

static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");

int	mp_naps;		/* # of application processors */
int	boot_cpu_id = -1;	/* designated BSP */

/* AP uses this during bootstrap.  Do not staticize.  */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/*
 * Interrupt counts.
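 * These per-CPU counters are registered with intrcnt_add() by
 * mp_ipi_intrcnt() at the bottom of this file and are bumped from the
 * IPI handlers, so the per-IPI totals show up alongside the other
 * kernel interrupt counters.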
 */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;

/* Used to hold the APs until we are ready to release them. */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info *cpu_info;
int *apic_cpuids;
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger than MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

static int	hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
    &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static int	hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
    &hyperthreading_intr_allowed, 0,
    "Allow interrupts on HTT logical CPUs");

static int	intr_apic_id_limit = -1;
SYSCTL_INT(_machdep, OID_AUTO, intr_apic_id_limit, CTLFLAG_RDTUN,
    &intr_apic_id_limit, 0,
    "Maximum permitted APIC ID for interrupt delivery (-1 is unlimited)");

static struct topo_node topo_root;

static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
    "Use MONITOR/MWAIT when stopping CPU, if available");

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Compute ceil(log2(x)).  Returns -1 if x is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (x == 0 ? -1 : order_base_2(x));
}

/*
 * Add a cache level to the cache topology description.
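 *
 * The type/level/share_count arguments come from a deterministic cache
 * CPUID leaf (0x4 on Intel, 0x8000001d on AMD); share_count is the number
 * of APIC IDs sharing the cache, so e.g. a share_count of 2 yields an
 * id_shift of 1.  Returning 0 tells the caller to stop enumerating
 * sub-leaves, while 1 means "keep going".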
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package (%u > %u)\n", level,
		    caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core (%u < %u)\n", level,
		    caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}

/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 *  - AMD CPUID Specification (Publication # 25481)
 *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 *  - BKDG For AMD Family 10h Processors (Publication # 31116)
 *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
 *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
 *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	uint64_t v;
	int level;
	int nodes_per_socket;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/*
	 * XXX Lack of an AMD IOMMU driver prevents use of APIC IDs above
	 * xAPIC_MAX_APIC_ID. This is a workaround so we boot and function on
	 * AMD systems with high thread counts, albeit with reduced interrupt
	 * performance.
	 *
	 * We should really set the limit to xAPIC_MAX_APIC_ID by default, and
	 * have the IOMMU driver increase it. That way if a driver is present
	 * but disabled, or is otherwise not able to route the interrupts, the
	 * system can fall back to a functional state. That will require a more
	 * substantial change though, including having the IOMMU initialize
	 * earlier.
	 */
	if (intr_apic_id_limit == -1)
		intr_apic_id_limit = xAPIC_MAX_APIC_ID;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	/*
	 * Families prior to 16h define the following value as
	 * cores per compute unit and we don't really care about the AMD
	 * compute units at the moment.  Perhaps we should treat them as
	 * cores and cores within the compute units as hardware threads,
	 * but that's up for debate.
	 * Later families define the value as threads per compute unit,
	 * so we are following AMD's nomenclature here.
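	 * For example, on a family 17h part with SMT enabled, CPUID
	 * Fn8000_001E EBX[15:8] reports 1, so share_count below becomes 2
	 * and core_id_shift becomes 1.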
	 */
	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
		cpuid_count(0x8000001e, 0, p);
		share_count = ((p[1] >> 8) & 0xff) + 1;
		core_id_shift = mask_width(share_count);

		/*
		 * For Zen (17h), gather Nodes per Processor.  Each node is a
		 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
		 * package.  Communication latency between dies is higher than
		 * within them.
		 */
		nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
		node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
	}

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {
				nodes_per_socket = 1;
				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
					/*
					 * Handle multi-node processors that
					 * have multiple chips, each with its
					 * own L3 cache, on the same die.
					 */
					v = rdmsr(0xc001100c);
					nodes_per_socket = 1 + ((v >> 3) & 0x7);
				}
				caches[2].id_shift =
				    pkg_id_shift - mask_width(nodes_per_socket);
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical / max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1Fh or 0Bh, if supported.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int leaf;
	u_int p[4] = { 0 };
	int bits;
	int type;
	int i;

	/* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */
	if (cpu_high >= 0x1f) {
		leaf = 0x1f;
		cpuid_count(leaf, 0, p);
	}
	/*
	 * Fall back to leaf 0Bh (Extended Topology Enumeration).
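	 * A zero EBX at sub-leaf 0 (checked below) indicates that the leaf
	 * is not implemented, so we step down to the next method.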
	 */
	if (p[1] == 0) {
		leaf = 0x0b;
		cpuid_count(leaf, 0, p);
	}
	/* Fall back to leaf 04h (Deterministic Cache Parameters). */
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(leaf, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else if (bootverbose)
			printf("Topology level type %d shift: %d\n", type, bits);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

/*
 * Determine topology of caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer’s Manual,
 *    Volume 2A: Instruction Set Reference, A-M,
 *    CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}

/*
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * The topology is then extrapolated to all packages under the
 * assumption that the APIC ID to hardware component ID mapping is
 * homogeneous.
 * That does not necessarily imply that the topology is uniform.
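 *
 * For example, with a hypothetical pkg_id_shift of 4 and core_id_shift of
 * 1, APIC ID 0x23 decomposes into package 0x2, core 0x1 within that
 * package, and SMT thread 1 within that core.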
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 5];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;
#if defined(DEV_ACPI) && MAXMEMDOM > 1
	int d, domain;
#endif

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
		topo_layers[nlayers].id_shift = node_id_shift;
		if (bootverbose)
			printf("Node ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			if (node_id_shift != 0)
				KASSERT(caches[i].id_shift <= node_id_shift,
				    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

#if defined(DEV_ACPI) && MAXMEMDOM > 1
	if (vm_ndomains > 1) {
		for (layer = 0; layer < nlayers; ++layer) {
			for (i = 0; i <= max_apic_id; ++i) {
				if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0)
					domain = -1;
				if (!cpu_info[i].cpu_present)
					continue;
				d = acpi_pxm_get_cpu_locality(i);
				if (domain >= 0 && domain != d)
					break;
				domain = d;
			}
			if (i > max_apic_id)
				break;
		}
		KASSERT(layer < nlayers, ("NUMA domain smaller than PU"));
		memmove(&topo_layers[layer + 1], &topo_layers[layer],
		    sizeof(*topo_layers) * (nlayers - layer));
		topo_layers[layer].type = TOPO_TYPE_NODE;
		topo_layers[layer].subtype = CG_SHARE_NONE;
		nlayers++;
	}
#endif

	topo_init_root(&topo_root);
	for (i = 0; i <= max_apic_id; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
			if (topo_layers[layer].type ==
			    TOPO_TYPE_NODE) {
				node_id = acpi_pxm_get_cpu_locality(i);
			} else
#endif
				node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
		if (topo_layers[layer].type == TOPO_TYPE_NODE)
			node_id = acpi_pxm_get_cpu_locality(boot_cpu_id);
		else
#endif
			node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}

/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;
	int nhyper;

	smt_mask = (1u << core_id_shift) - 1;

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	nhyper = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		if (cpu_info[node->hwid].cpu_hyperthread)
			nhyper++;

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));

	mp_ncores = mp_ncpus - nhyper;
	smp_threads_per_core = mp_ncpus / mp_ncores;
}

/*
 * Print various information about the SMP system hardware and setup.
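 * The first summary line covers every CPU that was probed; the "Online"
 * line is printed only when some CPUs were disabled and reflects the CPUs
 * actually brought up.  Per-node detail is printed only with bootverbose.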
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	struct topo_analysis topology;

	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &topology)) {
		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
			printf(" x %d groups",
			    topology.entities[TOPO_LEVEL_GROUP]);
		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
			printf(" x %d cache groups",
			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
		if (topology.entities[TOPO_LEVEL_CORE] > 0)
			printf(" x %d core(s)",
			    topology.entities[TOPO_LEVEL_CORE]);
		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
			printf(" x %d hardware threads",
			    topology.entities[TOPO_LEVEL_THREAD]);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &topology)) {
			printf("%d package(s)",
			    topology.entities[TOPO_LEVEL_PKG]);
			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
				printf(" x %d groups",
				    topology.entities[TOPO_LEVEL_GROUP]);
			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
				printf(" x %d cache groups",
				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
			if (topology.entities[TOPO_LEVEL_CORE] > 0)
				printf(" x %d core(s)",
				    topology.entities[TOPO_LEVEL_CORE]);
			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
				printf(" x %d hardware threads",
				    topology.entities[TOPO_LEVEL_THREAD]);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u "
				    "(disabled)\n", hyperthread, node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
				    node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
				    node->id, hyperthread, node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}

/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and thus having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
	    root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_CACHE)
		cg_root->cg_level = root->subtype;
	else
		cg_root->cg_level = CG_SHARE_NONE;
	if (root->type == TOPO_TYPE_NODE)
		cg_root->cg_flags = CG_FLAG_NODE;
	else
		cg_root->cg_flags = 0;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
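	 * In that case CG_FLAG_SMT is set below so that the scheduler knows
	 * the processing units in this group share a single core.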
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags |= CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		/*
		 * When some APICs are disabled by tunables, nodes can end up
		 * with an empty cpuset.  Nodes with an empty cpuset will be
		 * translated into cpu groups with empty cpusets.
		 * smp_topo_fill will then set cg_first and cg_last to -1.
		 * This isn't correctly handled in all functions.  E.g. when
		 * cpu_search_lowest and cpu_search_highest loop through all
		 * cpus, they call CPU_ISSET on cpu -1 which causes a general
		 * protection fault.
		 *
		 * We could fix the scheduler to handle empty cpu groups
		 * correctly.  Nevertheless, empty cpu groups cause overhead
		 * for no benefit, so it makes more sense simply not to
		 * create them.
		 */
		if (CPU_EMPTY(&node->cpuset)) {
			node = topo_next_node(root, node);
			continue;
		}
		if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
			if (node->type == TOPO_TYPE_CACHE &&
			    cg_root->cg_level < node->subtype)
				cg_root->cg_level = node->subtype;
			if (node->type == TOPO_TYPE_NODE)
				cg_root->cg_flags |= CG_FLAG_NODE;
			node = topo_next_node(root, node);
			continue;
		}
		if (node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	/*
	 * We are not interested in nodes containing only one CPU each.
	 */
	if (nchildren == root->cpu_count)
		return;

	/*
	 * We are not interested in nodes without children.
	 */
	cg_root->cg_children = nchildren;
	if (nchildren == 0)
		return;

	cg_root->cg_child = smp_topo_alloc(nchildren);

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_NODE &&
		    node->type != TOPO_TYPE_CACHE) ||
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0 ||
		    CPU_EMPTY(&node->cpuset)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
 */
struct cpu_group *
cpu_topo(void)
{
	struct cpu_group *cg_root;

	if (mp_ncpus <= 1)
		return (smp_topo_none());

	cg_root = smp_topo_alloc(1);
	x86topo_add_sched_group(&topo_root, cg_root);
	return (cg_root);
}

static void
cpu_alloc(void *dummy __unused)
{
	/*
	 * Dynamically allocate the arrays that depend on the
	 * maximum APIC ID.
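	 * cpu_add() below panics on any APIC ID above max_apic_id, so
	 * sizing the arrays this way covers every CPU that can ever be
	 * registered.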
	 */
	cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
	    M_WAITOK | M_ZERO);
	apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
	    M_WAITOK | M_ZERO);
}
SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);

/*
 * Add a logical CPU to the topology.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > max_apic_id)
		panic("SMP: APIC ID %d too high", apic_id);

	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
	 * If there were no calls to cpu_add() assume this is a UP system.
	 */
	if (mp_ncpus == 0)
		mp_ncpus = 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	return (mp_ncpus > 1);
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary_tail(void)
{
	u_int cpuid;

	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready was signalled, to avoid manipulating the
	 * mode while BSP might still want to send some IPI to us
	 * (second startup IPI is ignored on modern hardware etc).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* Set up CPU registers and state. */
	cpu_setregs();

	/* Set up SSE/NX. */
	initializecpu();

	/* Set up FPU state on the AP. */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/* A quick check from sanity claus */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));
	schedinit_ap();

	mtx_lock_spin(&ap_boot_mtx);

	mca_init();

	/* Init local APIC for IRQs. */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP. */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	if (bootverbose)
		printf("SMP: AP CPU #%d Launched!\n", cpuid);
	else
		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");

	/*
	 * Determine if we are a logical CPU.
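	 * i.e. a hyperthread that shares a core with another CPU; such
	 * CPUs are collected in logical_cpus_mask.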
	 */
	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, TLB shootdown, freezes, etc. */
		atomic_store_rel_int(&smp_started, 1);
	}

#ifdef __amd64__
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the APs are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

	kcsan_cpu_init(cpuid);

	sched_ap_entry();

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

static void
smp_after_idle_runnable(void *arg __unused)
{
	int cpu;

	if (mp_ncpus == 1)
		return;

	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));

	/*
	 * Wait for all APs to handle an interrupt.  After that, we know that
	 * the APs have entered the scheduler at least once, so the boot stacks
	 * are safe to free.
	 */
	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
	    smp_no_rendezvous_barrier, NULL);

	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE);
	}
}
SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
    smp_after_idle_runnable, NULL);

/*
 * We tell the I/O APIC code about all the CPUs that we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;
		if (intr_apic_id_limit >= 0 && apic_id > intr_apic_id_limit)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread &&
		    !hyperthreading_intr_allowed)
			continue;

		intr_add_cpu(i);
	}
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
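 *
 * The sequence below is the classic INIT/SIPI/SIPI dance: one INIT IPI
 * followed by two STARTUP IPIs whose 8-bit vector encodes the page-aligned
 * real-mode entry point (physical address vector << 12) at which the AP
 * begins execution.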
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * First we send an INIT IPI: it might be acted upon, resetting
	 * and restarting the target CPU; OR it might be latched (P5 bug),
	 * with the CPU waiting for a STARTUP IPI; OR it might be ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10 ms */

	/*
	 * Next we send a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this first STARTUP terminates
	 * immediately and the previously started INIT IPI continues; OR
	 * the previous INIT IPI has already run and this STARTUP IPI will
	 * run; OR the previous INIT IPI was ignored and this STARTUP IPI
	 * will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200 us */

	/*
	 * Finally we send a second STARTUP IPI: it should run IF the
	 * previous STARTUP IPI was cancelled by a latched INIT IPI; OR
	 * it will be ignored, as only ONE STARTUP IPI is recognized after
	 * a hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200 us */
}

static bool
ipi_bitmap_set(int cpu, u_int ipi)
{
	u_int bitmap, old, new;
	u_int *cpu_bitmap;

	bitmap = 1 << ipi;
	cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
	old = *cpu_bitmap;
	for (;;) {
		if ((old & bitmap) != 0)
			break;
		new = old | bitmap;
		if (atomic_fcmpset_int(cpu_bitmap, &old, new))
			break;
	}
	return (old != 0);
}

/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
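 *
 * Bitmapped IPIs such as preempt, AST, and hardclock are coalesced into
 * the per-CPU pc_ipi_bitmap and delivered through the single
 * IPI_BITMAP_VECTOR; if ipi_bitmap_set() reports that a bitmap IPI was
 * already pending, no new vector needs to be sent.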
 */
static void
ipi_send_cpu(int cpu, u_int ipi)
{

	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
	    ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		if (ipi_bitmap_set(cpu, ipi))
			return;
		ipi = IPI_BITMAP_VECTOR;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	kasan_mark(&frame, sizeof(frame), sizeof(frame), 0);

	td = curthread;
	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
	    pc_ipi_bitmap);

	/*
	 * sched_preempt() must be called to clear the pending preempt
	 * IPI to enable delivery of further preempts.  However, the
	 * critical section will cause extra scheduler lock thrashing
	 * when used unconditionally.  Only critical_enter() if
	 * hardclock must also run, which requires the section entry.
	 */
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_enter();

	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
#if defined(STACK) || defined(DDB)
	if (ipi_bitmap & (1 << IPI_TRACE))
		stack_capture_intr();
#endif
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_exit();
}

/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to identify the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	CPU_FOREACH_ISSET(cpu, &cpus) {
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * Send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to identify the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;
	int cpu, c;

	if (mp_ncpus == 1)
		return;

	/*
	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
	 * of help in order to identify the source.
	 * Set the mask of receiving CPUs for this purpose.
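	 * ipi_nmi_handler() consults this mask to distinguish a stop
	 * request from unrelated NMI sources.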
	 */
	if (ipi == IPI_STOP_HARD) {
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
	}

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	if (IPI_IS_BITMAPED(ipi)) {
		cpu = PCPU_GET(cpuid);
		CPU_FOREACH(c) {
			if (c != cpu)
				ipi_bitmap_set(c, ipi);
		}
		ipi = IPI_BITMAP_VECTOR;
	}
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

void
ipi_self_from_nmi(u_int vector)
{

	lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);

	/* Wait for IPI to finish. */
	if (!lapic_ipi_wait(50000)) {
		if (KERNEL_PANICKED())
			return;
		else
			panic("APIC: IPI is stuck");
	}
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * Since there is no simple way to identify an NMI's source,
	 * assume that if the bit for the current CPU is set in the
	 * global pending mask an IPI_STOP_HARD has been issued and
	 * should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

int nmi_kdb_lock;

void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
 * if available) until we are resumed.
 */
void
cpustop_handler(void)
{
	struct monitorbuf *mb;
	u_int cpu;
	bool use_mwait;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
	    !mwait_cpustop_broken);
	if (use_mwait) {
		mb = PCPU_PTR(monitorbuf);
		atomic_store_int(&mb->stop_state,
		    MONITOR_STOPSTATE_STOPPED);
	}

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus)) {
		if (use_mwait) {
			cpu_monitor(mb, 0, 0);
			if (atomic_load_int(&mb->stop_state) ==
			    MONITOR_STOPSTATE_STOPPED)
				cpu_mwait(0, MWAIT_C1);
			continue;
		}

		ia32_pause();

		/*
		 * Halt non-BSP CPUs on panic -- we're never going to need them
		 * again, and might as well save power / release resources
		 * (e.g., overprovisioned VM infrastructure).
		 */
		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
			halt();
	}

	cpustop_handler_post(cpu);
}

static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

	/*
	 * We don't broadcast TLB invalidations to other CPUs when they are
	 * stopped.  Hence, we clear the TLB before resuming.
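	 * invltlb_glob() also flushes global (PG_G) entries, which a plain
	 * CR3 reload would leave behind.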
	 */
	invltlb_glob();

#if defined(__amd64__) && (defined(DDB) || defined(GDB))
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

#ifdef __amd64__
	if (vmm_suspend_p)
		vmm_suspend_p();
#endif

	cpu = PCPU_GET(cpuid);

#ifdef XENHVM
	/*
	 * Some Xen guest types (PVH) expose a very minimal set of ACPI tables,
	 * and for example have no support for SCI.  That leads to the suspend
	 * stacks not being allocated, and hence when attempting to perform a
	 * Xen triggered suspension FreeBSD will hit a #PF.  Avoid saving the
	 * CPU and FPU contexts if the stacks are not allocated, as the
	 * hypervisor will already take care of this.  Note that we could even
	 * do this for Xen triggered suspensions on guests that have full ACPI
	 * support, but doing so would introduce extra complexity.
	 */
	if (susppcbs == NULL) {
		KASSERT(vm_guest == VM_GUEST_XEN, ("Missing suspend stack"));
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);
	} else
#endif
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		/*
		 * suspended_cpus is cleared shortly after each AP is restarted
		 * by a Startup IPI, so that the BSP can proceed to restarting
		 * the next AP.
		 *
		 * resuming_cpus gets cleared when the AP completes
		 * initialization after having been released by the BSP.
		 * resuming_cpus is probably not the best name for the
		 * variable, because it is actually a set of processors that
		 * haven't resumed yet and haven't necessarily started resuming.
		 *
		 * Note that suspended_cpus is meaningful only for ACPI suspend
		 * as it's not really used for Xen suspend since the APs are
		 * automatically restored to the running state and the correct
		 * context.  For the same reason resumectx is never called in
		 * that case.
		 */
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);

		/*
		 * Invalidate the cache after setting the global status bits.
		 * The last AP to set its bit may end up being an Owner of the
		 * corresponding cache line in MOESI protocol.  The AP may be
		 * stopped before the cache line is written to the main memory.
		 */
		wbinvd();
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we have restarted and restored the context. */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume directive */
	while (!CPU_ISSET(cpu, &toresume_cpus))
		ia32_pause();

	/* Re-apply microcode updates. */
	ucode_reload();

#ifdef __i386__
	/*
	 * Finish removing the identity mapping of low memory for this AP.
	 */
	invltlb_glob();
#endif

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}

/*
 * Handle an IPI_SWI by waking the delayed SWI thread.
 */
void
ipi_swi_handler(struct trapframe frame)
{

	intr_event_handle(clk_intr_event, &frame);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif