/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/smp.h>
#include <machine/atomic.h>

#ifdef _KERNEL_VIRTUAL
#include <pthread.h>
#endif

struct ipiq_stats {
        __int64_t ipiq_count;           /* total calls to lwkt_send_ipiq*() */
        __int64_t ipiq_fifofull;        /* number of fifo full conditions detected */
        __int64_t ipiq_avoided;         /* interlock with target avoids cpu ipi */
        __int64_t ipiq_passive;         /* passive IPI messages */
        __int64_t ipiq_cscount;         /* number of cpu synchronizations */
} __cachealign;

static struct ipiq_stats ipiq_stats_percpu[MAXCPU];
#define ipiq_stat(gd)   ipiq_stats_percpu[(gd)->gd_cpuid]

static int ipiq_debug;          /* set to 1 for debug */
#ifdef PANIC_DEBUG
static int panic_ipiq_cpu = -1;
static int panic_ipiq_count = 100;
#endif

SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0,
    "");
#ifdef PANIC_DEBUG
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
#endif

#define IPIQ_STRING     "func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
#define IPIQ_ARGS       void *func, void *arg1, int arg2, int scpu, int dcpu

#if !defined(KTR_IPIQ)
#define KTR_IPIQ        KTR_ALL
#endif
KTR_INFO_MASTER(ipiq);
KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, sync_end, 6, "cpumask=%08lx", unsigned long mask);
KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARGS);
KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARGS);

#define logipiq(name, func, arg1, arg2, sgd, dgd)       \
        KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
#define logipiq2(name, arg)     \
        KTR_LOG(ipiq_ ## name, arg)

static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
                                  struct intrframe *frame);
static void lwkt_cpusync_remote1(lwkt_cpusync_t cs);
static void lwkt_cpusync_remote2(lwkt_cpusync_t cs);

#define IPIQ_SYSCTL(name)                               \
static int                                              \
sysctl_##name(SYSCTL_HANDLER_ARGS)                      \
{                                                       \
        __int64_t val = 0;                              \
        int cpu, error;                                 \
                                                        \
        for (cpu = 0; cpu < ncpus; ++cpu)               \
                val += ipiq_stats_percpu[cpu].name;     \
                                                        \
        error = sysctl_handle_quad(oidp, &val, 0, req); \
        if (error || req->newptr == NULL)               \
                return error;                           \
                                                        \
        for (cpu = 0; cpu < ncpus; ++cpu)               \
                ipiq_stats_percpu[cpu].name = val;      \
                                                        \
        return 0;                                       \
}

IPIQ_SYSCTL(ipiq_count);
IPIQ_SYSCTL(ipiq_fifofull);
IPIQ_SYSCTL(ipiq_avoided);
IPIQ_SYSCTL(ipiq_passive);
IPIQ_SYSCTL(ipiq_cscount);

SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_count, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_count, "Q", "Number of IPI's sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_fifofull, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_fifofull, "Q",
    "Number of fifo full conditions detected");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_avoided, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_avoided, "Q",
    "Number of IPI's avoided by interlock with target cpu");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_passive, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_passive, "Q",
    "Number of passive IPI messages sent");
SYSCTL_PROC(_lwkt, OID_AUTO, ipiq_cscount, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, 0, sysctl_ipiq_cscount, "Q",
    "Number of cpu synchronizations");

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  The FIFO can be written.
 *
 * If the FIFO fills up we have to enable interrupts to avoid an APIC
 * deadlock and process pending IPIQs while waiting for it to empty.
 * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * The actual hardware IPI is avoided if the target cpu is already processing
 * the queue from a prior IPI.  It is possible to pipeline IPI messages
 * very quickly between cpus due to the FIFO hysteresis.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
{
        lwkt_ipiq_t ip;
        int windex;
#ifdef _KERNEL_VIRTUAL
        int repeating = 0;
#endif
        struct globaldata *gd = mycpu;

        logipiq(send_norm, func, arg1, arg2, gd, target);

        if (target == gd) {
                func(arg1, arg2, NULL);
                logipiq(send_end, func, arg1, arg2, gd, target);
                return(0);
        }
        crit_enter();
        ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
        if (gd->gd_intr_nesting_level > 20)
                panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
        KKASSERT(curthread->td_critcount);
        ++ipiq_stat(gd).ipiq_count;
        ip = &gd->gd_ipiq[target->gd_cpuid];

        /*
         * Do not allow the FIFO to become full.  Interrupts must be physically
         * enabled while we liveloop to avoid deadlocking the APIC.
         *
         * The target ipiq may have gotten filled up due to passive IPIs and thus
         * not be aware that its queue is too full, so be sure to issue an
         * ipiq interrupt to the target cpu.
         */
        if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
#if defined(__i386__)
                unsigned int eflags = read_eflags();
#elif defined(__x86_64__)
                unsigned long rflags = read_rflags();
#endif

                cpu_enable_intr();
                ++ipiq_stat(gd).ipiq_fifofull;
                DEBUG_PUSH_INFO("send_ipiq3");
                while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
                        if (atomic_poll_acquire_int(&target->gd_npoll)) {
                                logipiq(cpu_send, func, arg1, arg2, gd, target);
                                cpu_send_ipiq(target->gd_cpuid);
                        }
                        KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        if (repeating++ > 10)
                                pthread_yield();
#endif
                }
                DEBUG_POP_INFO();
#if defined(__i386__)
                write_eflags(eflags);
#elif defined(__x86_64__)
                write_rflags(rflags);
#endif
        }

        /*
         * Queue the new message
         */
        windex = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_info[windex].func = func;
        ip->ip_info[windex].arg1 = arg1;
        ip->ip_info[windex].arg2 = arg2;
        cpu_sfence();
        ++ip->ip_windex;
        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

        /*
         * Signal the target cpu that there is work pending.
         */
        if (atomic_poll_acquire_int(&target->gd_npoll)) {
                logipiq(cpu_send, func, arg1, arg2, gd, target);
                cpu_send_ipiq(target->gd_cpuid);
        } else {
                ++ipiq_stat(gd).ipiq_avoided;
        }
        --gd->gd_intr_nesting_level;
        crit_exit();
        logipiq(send_end, func, arg1, arg2, gd, target);

        return(ip->ip_windex);
}
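
/*
 * Usage sketch: a caller hands lwkt_send_ipiq3() a handler matching the
 * ipifunc3_t signature used throughout this file.  The handler, softc and
 * 'dcpu' names below are hypothetical, for illustration only.
 *
 *      static void
 *      example_remote_func(void *arg1, int arg2, struct intrframe *frame)
 *      {
 *              struct example_softc *sc = arg1;
 *
 *              atomic_add_int(&sc->remote_hits, 1);
 *      }
 *
 *      lwkt_send_ipiq3(globaldata_find(dcpu), example_remote_func, sc, 0);
 *
 * If the target is the current cpu the handler runs synchronously; otherwise
 * the message is queued on this cpu's FIFO for the target and a hardware IPI
 * is issued only if the target has not already been signalled and is not
 * already processing its queues.
 */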

/*
 * Similar to lwkt_send_ipiq() but this function does not actually initiate
 * the IPI to the target cpu unless the FIFO has become too full, so it is
 * very fast.
 *
 * This function is used for non-critical IPI messages, such as memory
 * deallocations.  The queue will typically be flushed by the target cpu at
 * the next clock interrupt.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
                        void *arg1, int arg2)
{
        lwkt_ipiq_t ip;
        int windex;
#ifdef _KERNEL_VIRTUAL
        int repeating = 0;
#endif
        struct globaldata *gd = mycpu;

        KKASSERT(target != gd);
        crit_enter();
        ++gd->gd_intr_nesting_level;
        logipiq(send_pasv, func, arg1, arg2, gd, target);
#ifdef INVARIANTS
        if (gd->gd_intr_nesting_level > 20)
                panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
        KKASSERT(curthread->td_critcount);
        ++ipiq_stat(gd).ipiq_count;
        ++ipiq_stat(gd).ipiq_passive;
        ip = &gd->gd_ipiq[target->gd_cpuid];

        /*
         * Do not allow the FIFO to become full.  Interrupts must be physically
         * enabled while we liveloop to avoid deadlocking the APIC.
         */
        if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
#if defined(__i386__)
                unsigned int eflags = read_eflags();
#elif defined(__x86_64__)
                unsigned long rflags = read_rflags();
#endif

                cpu_enable_intr();
                ++ipiq_stat(gd).ipiq_fifofull;
                DEBUG_PUSH_INFO("send_ipiq3_passive");
                while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
                        if (atomic_poll_acquire_int(&target->gd_npoll)) {
                                logipiq(cpu_send, func, arg1, arg2, gd, target);
                                cpu_send_ipiq(target->gd_cpuid);
                        }
                        KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        if (repeating++ > 10)
                                pthread_yield();
#endif
                }
                DEBUG_POP_INFO();
#if defined(__i386__)
                write_eflags(eflags);
#elif defined(__x86_64__)
                write_rflags(rflags);
#endif
        }

        /*
         * Queue the new message
         */
        windex = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_info[windex].func = func;
        ip->ip_info[windex].arg1 = arg1;
        ip->ip_info[windex].arg2 = arg2;
        cpu_sfence();
        ++ip->ip_windex;
        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);
        --gd->gd_intr_nesting_level;

        /*
         * Do not signal the target cpu, it will pick up the IPI when it next
         * polls (typically on the next tick).
         */
        crit_exit();
        logipiq(send_end, func, arg1, arg2, gd, target);

        return(ip->ip_windex);
}
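
/*
 * Usage sketch for the passive path: defer non-critical work, such as
 * returning an object to the cpu that owns it, without forcing a hardware
 * IPI.  The handler, object and owner fields below are hypothetical.
 *
 *      static void
 *      example_remote_free(void *arg1, int arg2, struct intrframe *frame)
 *      {
 *              example_obj_free(arg1);
 *      }
 *
 *      lwkt_send_ipiq3_passive(obj->owner_gd, example_remote_free, obj, 0);
 *
 * The message sits in the target's FIFO until that cpu next polls its ipiq,
 * typically from the next clock interrupt, trading latency for avoiding the
 * hardware IPI entirely.
 */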

/*
 * Send an IPI request without blocking, return 0 on success, ENOENT on
 * failure.  The actual queueing of the hardware IPI may still force us
 * to spin and process incoming IPIs but that will eventually go away
 * when we've gotten rid of the other general IPIs.
 */
int
lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func,
                       void *arg1, int arg2)
{
        lwkt_ipiq_t ip;
        int windex;
        struct globaldata *gd = mycpu;

        logipiq(send_nbio, func, arg1, arg2, gd, target);
        KKASSERT(curthread->td_critcount);
        if (target == gd) {
                func(arg1, arg2, NULL);
                logipiq(send_end, func, arg1, arg2, gd, target);
                return(0);
        }
        crit_enter();
        ++gd->gd_intr_nesting_level;
        ++ipiq_stat(gd).ipiq_count;
        ip = &gd->gd_ipiq[target->gd_cpuid];

        if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3) {
                logipiq(send_fail, func, arg1, arg2, gd, target);
                --gd->gd_intr_nesting_level;
                crit_exit();
                return(ENOENT);
        }
        windex = ip->ip_windex & MAXCPUFIFO_MASK;
        ip->ip_info[windex].func = func;
        ip->ip_info[windex].arg1 = arg1;
        ip->ip_info[windex].arg2 = arg2;
        cpu_sfence();
        ++ip->ip_windex;
        ATOMIC_CPUMASK_ORBIT(target->gd_ipimask, gd->gd_cpuid);

        /*
         * This isn't a passive IPI, we still have to signal the target cpu.
         */
        if (atomic_poll_acquire_int(&target->gd_npoll)) {
                logipiq(cpu_send, func, arg1, arg2, gd, target);
                cpu_send_ipiq(target->gd_cpuid);
        } else {
                ++ipiq_stat(gd).ipiq_avoided;
        }
        --gd->gd_intr_nesting_level;
        crit_exit();

        logipiq(send_end, func, arg1, arg2, gd, target);
        return(0);
}
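
/*
 * Usage sketch for the non-blocking path.  The caller must already be in a
 * critical section; on ENOENT (the target FIFO is nearly full) it can fall
 * back to the blocking sender or retry later.  Handler and argument names
 * are hypothetical.
 *
 *      crit_enter();
 *      if (lwkt_send_ipiq3_nowait(target, example_remote_func, p, 0) != 0)
 *              lwkt_send_ipiq3(target, example_remote_func, p, 0);
 *      crit_exit();
 */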

/*
 * deprecated, used only by fast int forwarding.
 */
int
lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
{
        return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 */
int
lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2)
{
        int cpuid;
        int count = 0;

        CPUMASK_NANDMASK(mask, stopped_cpus);
        while (CPUMASK_TESTNZERO(mask)) {
                cpuid = BSFCPUMASK(mask);
                lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
                CPUMASK_NANDBIT(mask, cpuid);
                ++count;
        }
        return(count);
}

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
        lwkt_ipiq_t ip;

        if (target != mycpu) {
                ip = &mycpu->gd_ipiq[target->gd_cpuid];
                if ((int)(ip->ip_xindex - seq) < 0) {
#if defined(__i386__)
                        unsigned int eflags = read_eflags();
#elif defined(__x86_64__)
                        unsigned long rflags = read_rflags();
#endif
                        int64_t time_tgt = tsc_get_target(1000000000LL);
                        int time_loops = 10;
                        int benice = 0;
#ifdef _KERNEL_VIRTUAL
                        int repeating = 0;
#endif

                        cpu_enable_intr();
                        DEBUG_PUSH_INFO("wait_ipiq");
                        while ((int)(ip->ip_xindex - seq) < 0) {
                                crit_enter();
                                lwkt_process_ipiq();
                                crit_exit();
#ifdef _KERNEL_VIRTUAL
                                if (repeating++ > 10)
                                        pthread_yield();
#endif

                                /*
                                 * IPIQs must be handled within 10 seconds and this code
                                 * will warn after one second.
                                 */
                                if ((benice & 255) == 0 && tsc_test_target(time_tgt) > 0) {
                                        kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
                                                mycpu->gd_cpuid, target->gd_cpuid,
                                                ip->ip_xindex - seq);
                                        if (--time_loops == 0)
                                                panic("LWKT_WAIT_IPIQ");
                                        time_tgt = tsc_get_target(1000000000LL);
                                }
                                ++benice;

                                /*
                                 * xindex may be modified by another cpu, use a load fence
                                 * to ensure that the loop does not use a speculative value
                                 * (which may improve performance).
                                 */
                                cpu_pause();
                                cpu_lfence();
                        }
                        DEBUG_POP_INFO();
#if defined(__i386__)
                        write_eflags(eflags);
#elif defined(__x86_64__)
                        write_rflags(rflags);
#endif
                }
        }
}

int
lwkt_seq_ipiq(globaldata_t target)
{
        lwkt_ipiq_t ip;

        ip = &mycpu->gd_ipiq[target->gd_cpuid];
        return(ip->ip_windex);
}
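
/*
 * Usage sketch: the sequence number returned by the senders can be handed
 * to lwkt_wait_ipiq() to spin until the target cpu has finished executing
 * the queued function.  lwkt_wait_ipiq() must be called from a critical
 * section; handler and argument names are hypothetical.
 *
 *      int seq;
 *
 *      crit_enter();
 *      seq = lwkt_send_ipiq3(target, example_remote_func, p, 0);
 *      lwkt_wait_ipiq(target, seq);
 *      crit_exit();
 */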

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_info[].func we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 *
 * When the current cpu is mastering a cpusync we do NOT internally loop
 * on the cpusyncq poll.  We also do not re-flag a pending ipi due to
 * the cpusyncq poll because this can cause doreti/splz to loop internally.
 * The cpusync master's own loop must be allowed to run to avoid a deadlock.
 */
void
lwkt_process_ipiq(void)
{
        globaldata_t gd = mycpu;
        globaldata_t sgd;
        lwkt_ipiq_t ip;
        cpumask_t mask;
        int n;

        ++gd->gd_processing_ipiq;
again:
        cpu_lfence();
        mask = gd->gd_ipimask;
        ATOMIC_CPUMASK_NANDMASK(gd->gd_ipimask, mask);
        while (CPUMASK_TESTNZERO(mask)) {
                n = BSFCPUMASK(mask);
                if (n != gd->gd_cpuid) {
                        sgd = globaldata_find(n);
                        ip = sgd->gd_ipiq;
                        if (ip != NULL) {
                                while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
                                        ;
                        }
                }
                CPUMASK_NANDBIT(mask, n);
        }

        /*
         * Process pending cpusyncs.  If the current thread has an active
         * cpusync we only run the list once and do not re-flag it, as the
         * thread itself is processing its interlock.
         */
        if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
                if (gd->gd_curthread->td_cscount == 0)
                        goto again;
                /* need_ipiq(); do not reflag */
        }

        /*
         * Interlock to allow more IPI interrupts.  Recheck ipimask after
         * releasing gd_npoll.
         */
        if (CPUMASK_TESTNZERO(gd->gd_ipimask))
                goto again;
        atomic_poll_release_int(&gd->gd_npoll);
        cpu_mfence();
        if (CPUMASK_TESTNZERO(gd->gd_ipimask))
                goto again;
        --gd->gd_processing_ipiq;
}

void
lwkt_process_ipiq_frame(struct intrframe *frame)
{
        globaldata_t gd = mycpu;
        globaldata_t sgd;
        lwkt_ipiq_t ip;
        cpumask_t mask;
        int n;

again:
        cpu_lfence();
        mask = gd->gd_ipimask;
        ATOMIC_CPUMASK_NANDMASK(gd->gd_ipimask, mask);
        while (CPUMASK_TESTNZERO(mask)) {
                n = BSFCPUMASK(mask);
                if (n != gd->gd_cpuid) {
                        sgd = globaldata_find(n);
                        ip = sgd->gd_ipiq;
                        if (ip != NULL) {
                                while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], frame))
                                        ;
                        }
                }
                CPUMASK_NANDBIT(mask, n);
        }
        if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
                if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) {
                        if (gd->gd_curthread->td_cscount == 0)
                                goto again;
                        /* need_ipiq(); do not reflag */
                }
        }

        /*
         * Interlock to allow more IPI interrupts.  Recheck ipimask after
         * releasing gd_npoll.
         */
        if (CPUMASK_TESTNZERO(gd->gd_ipimask))
                goto again;
        atomic_poll_release_int(&gd->gd_npoll);
        cpu_mfence();
        if (CPUMASK_TESTNZERO(gd->gd_ipimask))
                goto again;
}

#if 0
static int iqticks[SMP_MAXCPU];
static int iqcount[SMP_MAXCPU];
#endif
#if 0
static int iqterm[SMP_MAXCPU];
#endif

static int
lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
                       struct intrframe *frame)
{
        globaldata_t mygd = mycpu;
        int ri;
        int wi;
        ipifunc3_t copy_func;
        void *copy_arg1;
        int copy_arg2;

#if 0
        if (iqticks[mygd->gd_cpuid] != ticks) {
                iqticks[mygd->gd_cpuid] = ticks;
                iqcount[mygd->gd_cpuid] = 0;
        }
        if (++iqcount[mygd->gd_cpuid] > 3000000) {
                kprintf("cpu %d ipiq maxed cscount %d spin %d\n",
                        mygd->gd_cpuid,
                        mygd->gd_curthread->td_cscount,
                        mygd->gd_spinlocks);
                iqcount[mygd->gd_cpuid] = 0;
#if 0
                if (++iqterm[mygd->gd_cpuid] > 10)
                        panic("cpu %d ipiq maxed", mygd->gd_cpuid);
#endif
                int i;
                for (i = 0; i < ncpus; ++i) {
                        if (globaldata_find(i)->gd_infomsg)
                                kprintf(" %s", globaldata_find(i)->gd_infomsg);
                }
                kprintf("\n");
        }
#endif

        /*
         * Clear the originating core from our ipimask, we will process all
         * incoming messages.
         *
         * Obtain the current write index, which is modified by a remote cpu.
         * Issue a load fence to prevent speculative reads of e.g. data written
         * by the other cpu prior to it updating the index.
         */
        KKASSERT(curthread->td_critcount);
        wi = ip->ip_windex;
        cpu_lfence();
        ++mygd->gd_intr_nesting_level;

        /*
         * NOTE: xindex is only updated after we are sure the function has
         *       finished execution.  Beware lwkt_process_ipiq() reentrancy!
         *       The function may send an IPI which may block/drain.
         *
         * NOTE: Due to additional IPI operations that the callback function
         *       may make, it is possible for both rindex and windex to advance
         *       and thus for rindex to advance past our cached windex.
         *
         * NOTE: A load fence is required to prevent speculative loads prior
         *       to the loading of ip_rindex.  Even though stores might be
         *       ordered, loads are probably not.  A memory fence is required
         *       to prevent reordering of the loads after the ip_rindex update.
         *
         * NOTE: Single pass only.  Returns non-zero if the queue is not empty
         *       on return.
         */
        while (wi - (ri = ip->ip_rindex) > 0) {
                ri &= MAXCPUFIFO_MASK;
                cpu_lfence();
                copy_func = ip->ip_info[ri].func;
                copy_arg1 = ip->ip_info[ri].arg1;
                copy_arg2 = ip->ip_info[ri].arg2;
                cpu_mfence();
                ++ip->ip_rindex;
                KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) ==
                         ((ri + 1) & MAXCPUFIFO_MASK));
                logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
#ifdef INVARIANTS
                if (ipiq_debug && (ip->ip_rindex & 0xFFFFFF) == 0) {
                        kprintf("cpu %d ipifunc %p %p %d (frame %p)\n",
                                mycpu->gd_cpuid,
                                copy_func, copy_arg1, copy_arg2,
#if defined(__i386__)
                                (frame ? (void *)frame->if_eip : NULL));
#elif defined(__x86_64__)
                                (frame ? (void *)frame->if_rip : NULL));
#else
                                NULL);
#endif
                }
#endif
                copy_func(copy_arg1, copy_arg2, frame);
                cpu_sfence();
                ip->ip_xindex = ip->ip_rindex;

#ifdef PANIC_DEBUG
                /*
                 * Simulate panics during the processing of an IPI
                 */
                if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
                        if (--panic_ipiq_count == 0) {
#ifdef DDB
                                Debugger("PANIC_DEBUG");
#else
                                panic("PANIC_DEBUG");
#endif
                        }
                }
#endif
        }
        --mygd->gd_intr_nesting_level;

        /*
         * Return non-zero if there is still more in the queue.
         */
        cpu_lfence();
        return (ip->ip_rindex != ip->ip_windex);
}
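
/*
 * The index discipline used by the ipiq FIFO, shown in isolation:
 * ip_windex and ip_rindex increase monotonically and are only masked when
 * used as array subscripts, so (windex - rindex) is the number of queued
 * entries even after the counters wrap, as long as MAXCPUFIFO is a power
 * of 2 and the senders keep the queue from filling completely (which the
 * FIFO-full throttling and the KKASSERTs above enforce).  A minimal
 * single-producer/single-consumer sketch of the same idea, standalone and
 * not kernel code:
 *
 *      #define FIFO_SIZE       16              // power of 2
 *      #define FIFO_MASK       (FIFO_SIZE - 1)
 *
 *      struct ring {
 *              unsigned int    windex;         // advanced by the producer
 *              unsigned int    rindex;         // advanced by the consumer
 *              int             slot[FIFO_SIZE];
 *      };
 *
 *      static int
 *      ring_put(struct ring *r, int v)         // returns 0 if full
 *      {
 *              if (r->windex - r->rindex >= FIFO_SIZE)
 *                      return 0;
 *              r->slot[r->windex & FIFO_MASK] = v;
 *              // cpu_sfence() would go here in the SMP case
 *              ++r->windex;
 *              return 1;
 *      }
 *
 *      static int
 *      ring_get(struct ring *r, int *vp)       // returns 0 if empty
 *      {
 *              if (r->windex == r->rindex)
 *                      return 0;
 *              // cpu_lfence() would go here in the SMP case
 *              *vp = r->slot[r->rindex & FIFO_MASK];
 *              ++r->rindex;
 *              return 1;
 *      }
 */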

static void
lwkt_sync_ipiq(void *arg)
{
        volatile cpumask_t *cpumask = arg;

        ATOMIC_CPUMASK_NANDBIT(*cpumask, mycpu->gd_cpuid);
        if (CPUMASK_TESTZERO(*cpumask))
                wakeup(cpumask);
}

void
lwkt_synchronize_ipiqs(const char *wmesg)
{
        volatile cpumask_t other_cpumask;

        other_cpumask = smp_active_mask;
        CPUMASK_ANDMASK(other_cpumask, mycpu->gd_other_cpus);
        lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq,
                            __DEVOLATILE(void *, &other_cpumask));

        while (CPUMASK_TESTNZERO(other_cpumask)) {
                tsleep_interlock(&other_cpumask, 0);
                if (CPUMASK_TESTNZERO(other_cpumask))
                        tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
        }
}
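
/*
 * Usage sketch: a caller that has queued work to other cpus and needs to
 * know that every other cpu has run its ipiq processing loop (and thus
 * consumed anything queued ahead of the synchronization request) can
 * simply block on it; the wmesg string below is arbitrary.
 *
 *      lwkt_synchronize_ipiqs("ipiqsync");
 */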

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_interlock()     - Place specified cpus in a quiescent state.
 *                                The current cpu is placed in a hard critical
 *                                section.
 *
 * lwkt_cpusync_deinterlock()   - Execute cs_func on specified cpus, including
 *                                current cpu if specified, then return.
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *arg)
{
        struct lwkt_cpusync cs;

        lwkt_cpusync_init(&cs, mask, func, arg);
        lwkt_cpusync_interlock(&cs);
        lwkt_cpusync_deinterlock(&cs);
}

void
lwkt_cpusync_interlock(lwkt_cpusync_t cs)
{
#if 0
        const char *smsg = "SMPSYNL";
#endif
        globaldata_t gd = mycpu;
        cpumask_t mask;

        /*
         * mask acknowledge (cs_mack): 0->mask for stage 1
         *
         * mack does not include the current cpu.
         */
        mask = cs->cs_mask;
        CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
        CPUMASK_ANDMASK(mask, smp_active_mask);
        CPUMASK_ASSZERO(cs->cs_mack);

        crit_enter_id("cpusync");
        if (CPUMASK_TESTNZERO(mask)) {
                DEBUG_PUSH_INFO("cpusync_interlock");
                ++ipiq_stat(gd).ipiq_cscount;
                ++gd->gd_curthread->td_cscount;
                lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs);
                logipiq2(sync_start, (long)CPUMASK_LOWMASK(mask));
#if 0
                if (gd->gd_curthread->td_wmesg == NULL)
                        gd->gd_curthread->td_wmesg = smsg;
#endif
                while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        pthread_yield();
#endif
                }
#if 0
                if (gd->gd_curthread->td_wmesg == smsg)
                        gd->gd_curthread->td_wmesg = NULL;
#endif
                DEBUG_POP_INFO();
        }
}

/*
 * Interlocked cpus have executed remote1 and are polling in remote2.
 * To deinterlock we clear cs_mack and wait for the cpus to execute
 * the func and set their bit in cs_mack again.
 */
void
lwkt_cpusync_deinterlock(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;
#if 0
        const char *smsg = "SMPSYNU";
#endif
        cpumask_t mask;

        /*
         * mask acknowledge (cs_mack): mack->0->mack for stage 2
         *
         * Clearing cpu bits for polling cpus in cs_mack will cause them to
         * execute stage 2, which executes the cs_func(cs_data) and then sets
         * their bit in cs_mack again.
         *
         * mack does not include the current cpu.
         */
        mask = cs->cs_mack;
        cpu_ccfence();
        CPUMASK_ASSZERO(cs->cs_mack);
        cpu_ccfence();
        if (cs->cs_func && CPUMASK_TESTBIT(cs->cs_mask, gd->gd_cpuid))
                cs->cs_func(cs->cs_data);
        if (CPUMASK_TESTNZERO(mask)) {
                DEBUG_PUSH_INFO("cpusync_deinterlock");
#if 0
                if (gd->gd_curthread->td_wmesg == NULL)
                        gd->gd_curthread->td_wmesg = smsg;
#endif
                while (CPUMASK_CMPMASKNEQ(cs->cs_mack, mask)) {
                        lwkt_process_ipiq();
                        cpu_pause();
#ifdef _KERNEL_VIRTUAL
                        pthread_yield();
#endif
                }
#if 0
                if (gd->gd_curthread->td_wmesg == smsg)
                        gd->gd_curthread->td_wmesg = NULL;
#endif
                DEBUG_POP_INFO();
                /*
                 * cpusyncq ipis may be left queued without the RQF flag set due to
                 * a non-zero td_cscount, so be sure to process any laggards after
                 * decrementing td_cscount.
                 */
                --gd->gd_curthread->td_cscount;
                lwkt_process_ipiq();
                logipiq2(sync_end, (long)CPUMASK_LOWMASK(mask));
        }
        crit_exit_id("cpusync");
}
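
/*
 * Usage sketch: the interlock/deinterlock pair lets the caller modify
 * shared state while the other cpus in the mask are quiescent, after which
 * cs_func runs on every cpu in the mask (lwkt_cpusync_simple() above is
 * the degenerate form with no work in between).  The function and state
 * names below are hypothetical.
 *
 *      struct lwkt_cpusync cs;
 *
 *      lwkt_cpusync_init(&cs, smp_active_mask, example_reload_func, sc);
 *      lwkt_cpusync_interlock(&cs);
 *
 *      example_update_shared_state(sc);
 *
 *      lwkt_cpusync_deinterlock(&cs);
 *
 * Between interlock and deinterlock the other cpus in the mask spin in
 * lwkt_cpusync_remote2(); deinterlock then runs example_reload_func on
 * each cpu in cs_mask, including the current one, before returning.
 */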

/*
 * helper IPI remote messaging function.
 *
 * Called on the remote cpu when a new cpu synchronization request has been
 * sent to us.  Acknowledge stage 1 by setting our bit in cs_mack, then
 * drop into the stage 2 poll via lwkt_cpusync_remote2().
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;

        ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
        lwkt_cpusync_remote2(cs);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t cs)
{
        globaldata_t gd = mycpu;

        if (CPUMASK_TESTMASK(cs->cs_mack, gd->gd_cpumask) == 0) {
                if (cs->cs_func)
                        cs->cs_func(cs->cs_data);
                ATOMIC_CPUMASK_ORBIT(cs->cs_mack, gd->gd_cpuid);
                /* cs can be ripped out at this point */
        } else {
                lwkt_ipiq_t ip;
                int wi;

                cpu_pause();
#ifdef _KERNEL_VIRTUAL
                pthread_yield();
#endif
                cpu_lfence();

                /*
                 * Requeue our IPI to avoid a deep stack recursion.  If no other
                 * IPIs are pending we can just loop up, which should help VMs
                 * better-detect spin loops.
                 */
                ip = &gd->gd_cpusyncq;
#if 0
                if (ip->ip_rindex == ip->ip_windex) {
                        __asm __volatile("cli");
                        if (ip->ip_rindex == ip->ip_windex) {
                                __asm __volatile("sti; hlt");
                        } else {
                                __asm __volatile("sti");
                        }
                }
#endif

                wi = ip->ip_windex & MAXCPUFIFO_MASK;
                ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
                ip->ip_info[wi].arg1 = cs;
                ip->ip_info[wi].arg2 = 0;
                cpu_sfence();
                KKASSERT(ip->ip_windex - ip->ip_rindex < MAXCPUFIFO);
                ++ip->ip_windex;
                if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
                        kprintf("cpu %d cm=%016jx %016jx f=%p\n",
                                gd->gd_cpuid,
                                (intmax_t)CPUMASK_LOWMASK(cs->cs_mask),
                                (intmax_t)CPUMASK_LOWMASK(cs->cs_mack),
                                cs->cs_func);
                }
        }
}