/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/uio.h>
#include <sys/kcollect.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/ktr.h>
#include <sys/serialize.h>

#include <sys/signal2.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/smp.h>

TAILQ_HEAD(tslpque, thread);

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL);
static void sched_dyninit(void *dummy);
SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);

int     lbolt;
void    *lbolt_syncer;
int     ncpus;
int     ncpus2, ncpus2_shift, ncpus2_mask;      /* note: mask not cpumask_t */
int     ncpus_fit, ncpus_fit_mask;              /* note: mask not cpumask_t */
int     safepri;
int     tsleep_now_works;
int     tsleep_crypto_dump = 0;

MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");

#define __DEALL(ident)  __DEQUALIFY(void *, ident)

#if !defined(KTR_TSLEEP)
#define KTR_TSLEEP      KTR_ALL
#endif
KTR_INFO_MASTER(tsleep);
KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident);
KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit");
KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident);
KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit");
KTR_INFO(KTR_TSLEEP, tsleep, ilockfail, 4, "interlock failed %p", const volatile void *ident);

#define logtsleep1(name)        KTR_LOG(tsleep_ ## name)
#define logtsleep2(name, val)   KTR_LOG(tsleep_ ## name, val)

struct loadavg averunnable =
        { {0, 0, 0}, FSCALE };  /* load average of runnable procs */
/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
 */
static fixpt_t cexp[3] = {
        0.9200444146293232 * FSCALE,   /* exp(-1/12) */
        0.9834714538216174 * FSCALE,   /* exp(-1/60) */
        0.9944598480048967 * FSCALE,   /* exp(-1/180) */
};
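/*
 * Illustrative note (not compiled): the cexp[] constants above implement an
 * exponentially weighted moving average.  With a 5 second sampling interval,
 * the 1, 5 and 15 minute windows correspond to decay factors exp(-5/60) =
 * exp(-1/12), exp(-5/300) = exp(-1/60) and exp(-5/900) = exp(-1/180).  Each
 * sample updates the average roughly as
 *
 *      ldavg = ldavg * cexp[i] + nrun * FSCALE * (1 - cexp[i])
 *
 * in FSCALE fixed point, which is the recurrence used by loadav() below.
 */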
static void     endtsleep(void *);
static void     loadav(void *arg);
static void     schedcpu(void *arg);

static int pctcpu_decay = 10;
SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW, &pctcpu_decay, 0, "");

/*
 * The kernel uses `FSCALE'; userland SHOULD use kern.fscale.
 */
int fscale __unused = FSCALE;   /* exported to systat */
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * Recompute process priorities, once a second.
 *
 * Since the userland schedulers are typically event oriented, if the
 * estcpu calculation at wakeup() time is not sufficient to make a
 * process runnable relative to other processes in the system we have
 * a 1-second recalc to help out.
 *
 * This code also allows us to store sysclock_t data in the process structure
 * without fear of an overrun, since sysclock_t values are guaranteed to hold
 * several seconds worth of count.
 *
 * WARNING! callouts can preempt normal threads.  However, they will not
 *          preempt a thread holding a spinlock, so we *can* safely use
 *          spinlocks.
 */
static int schedcpu_stats(struct proc *p, void *data __unused);
static int schedcpu_resource(struct proc *p, void *data __unused);

static void
schedcpu(void *arg)
{
        allproc_scan(schedcpu_stats, NULL, 1);
        allproc_scan(schedcpu_resource, NULL, 1);
        if (mycpu->gd_cpuid == 0) {
                wakeup((caddr_t)&lbolt);
                wakeup(lbolt_syncer);
        }
        callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
}

/*
 * General process statistics once a second
 */
static int
schedcpu_stats(struct proc *p, void *data __unused)
{
        struct lwp *lp;

        /*
         * Threads may not be completely set up if the process is in the
         * SIDL state.
         */
        if (p->p_stat == SIDL)
                return(0);

        PHOLD(p);
        if (lwkt_trytoken(&p->p_token) == FALSE) {
                PRELE(p);
                return(0);
        }

        p->p_swtime++;
        FOREACH_LWP_IN_PROC(lp, p) {
                if (lp->lwp_stat == LSSLEEP) {
                        ++lp->lwp_slptime;
                        if (lp->lwp_slptime == 1)
                                p->p_usched->uload_update(lp);
                }

                /*
                 * Only recalculate processes that are active or have slept
                 * less than 2 seconds.  The schedulers understand this.
                 * Otherwise decay by 50% per second.
                 */
                if (lp->lwp_slptime <= 1) {
                        p->p_usched->recalculate(lp);
                } else {
                        int decay;

                        decay = pctcpu_decay;
                        cpu_ccfence();
                        if (decay <= 1)
                                decay = 1;
                        if (decay > 100)
                                decay = 100;
                        lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) / decay;
                }
        }
        lwkt_reltoken(&p->p_token);
        lwkt_yield();
        PRELE(p);
        return(0);
}

/*
 * Resource checks.  XXX break out since ksignal/killproc can block,
 * limiting us to one process killed per second.  There is probably
 * a better way.
 */
static int
schedcpu_resource(struct proc *p, void *data __unused)
{
        u_int64_t ttime;
        struct lwp *lp;

        if (p->p_stat == SIDL)
                return(0);

        PHOLD(p);
        if (lwkt_trytoken(&p->p_token) == FALSE) {
                PRELE(p);
                return(0);
        }

        if (p->p_stat == SZOMB || p->p_limit == NULL) {
                lwkt_reltoken(&p->p_token);
                PRELE(p);
                return(0);
        }

        ttime = 0;
        FOREACH_LWP_IN_PROC(lp, p) {
                /*
                 * We may have caught an lp in the middle of being
                 * created, lwp_thread can be NULL.
                 */
                if (lp->lwp_thread) {
                        ttime += lp->lwp_thread->td_sticks;
                        ttime += lp->lwp_thread->td_uticks;
                }
        }

        switch(plimit_testcpulimit(p->p_limit, ttime)) {
        case PLIMIT_TESTCPU_KILL:
                killproc(p, "exceeded maximum CPU limit");
                break;
        case PLIMIT_TESTCPU_XCPU:
                if ((p->p_flags & P_XCPU) == 0) {
                        p->p_flags |= P_XCPU;
                        ksignal(p, SIGXCPU);
                }
                break;
        default:
                break;
        }
        lwkt_reltoken(&p->p_token);
        lwkt_yield();
        PRELE(p);
        return(0);
}

/*
 * This is only used by ps.  Generate a cpu percentage use over
 * a period of one second.
 */
void
updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
{
        fixpt_t acc;
        int remticks;

        acc = (cpticks << FSHIFT) / ttlticks;
        if (ttlticks >= ESTCPUFREQ) {
                lp->lwp_pctcpu = acc;
        } else {
                remticks = ESTCPUFREQ - ttlticks;
                lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
                                 ESTCPUFREQ;
        }
}
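/*
 * Illustrative note (hypothetical numbers, not compiled): lwp_pctcpu is an
 * FSCALE fixed-point fraction.  In updatepcpu(), acc = (cpticks << FSHIFT) /
 * ttlticks is the fraction of the sampled ticks this lwp consumed.  When the
 * sample already covers a full second (ttlticks >= ESTCPUFREQ) that fraction
 * is used directly; e.g. cpticks = 30 over ttlticks = 60 would yield
 * 0.5 * FSCALE, i.e. roughly 50% cpu.  For shorter samples the new fraction
 * is blended with the previous value in proportion to how much of the
 * second was actually sampled.
 */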
/*
 * Handy macros to calculate hash indices.  LOOKUP() calculates the
 * global cpumask hash index, TCHASHSHIFT() converts that into the
 * pcpu hash index.
 *
 * By making the pcpu hash arrays smaller we save a significant amount
 * of memory at very low cost.  The real cost is in IPIs, which are handled
 * by the much larger global cpumask hash table.
 */
#define LOOKUP(x)       (((u_int)(uintptr_t)(x)) % slpque_tablesize)
#define TCHASHSHIFT(x)  ((x) >> 4)

static uint32_t slpque_tablesize;
static cpumask_t *slpque_cpumasks;

/*
 * This is a dandy function that allows us to interlock tsleep/wakeup
 * operations with unspecified upper level locks, such as lockmgr locks,
 * simply by holding a critical section.  The sequence is:
 *
 *      (acquire upper level lock)
 *      tsleep_interlock(blah)
 *      (release upper level lock)
 *      tsleep(blah, ...)
 *
 * Basically this function queues us on the tsleep queue without actually
 * descheduling us.  When tsleep() is later called with PINTERLOCKED it
 * assumes the thread was already queued, otherwise it queues it there.
 *
 * Thus it is possible to receive the wakeup prior to going to sleep and
 * the race conditions are covered.
 */
static __inline void
_tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)
{
        thread_t td = gd->gd_curthread;
        uint32_t cid;
        uint32_t gid;

        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ) {
                cid = LOOKUP(td->td_wchan);
                gid = TCHASHSHIFT(cid);
                TAILQ_REMOVE(&gd->gd_tsleep_hash[gid], td, td_sleepq);
                if (TAILQ_FIRST(&gd->gd_tsleep_hash[gid]) == NULL) {
                        ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
                                               gd->gd_cpuid);
                }
        } else {
                td->td_flags |= TDF_TSLEEPQ;
        }
        cid = LOOKUP(ident);
        gid = TCHASHSHIFT(cid);
        TAILQ_INSERT_TAIL(&gd->gd_tsleep_hash[gid], td, td_sleepq);
        ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid);
        td->td_wchan = ident;
        td->td_wdomain = flags & PDOMAIN_MASK;
        crit_exit_quick(td);
}

void
tsleep_interlock(const volatile void *ident, int flags)
{
        _tsleep_interlock(mycpu, ident, flags);
}

/*
 * Remove thread from sleepq.  Must be called with a critical section held.
 * The thread must not be migrating.
 */
static __inline void
_tsleep_remove(thread_t td)
{
        globaldata_t gd = mycpu;
        uint32_t cid;
        uint32_t gid;

        KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td));
        KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
        if (td->td_flags & TDF_TSLEEPQ) {
                td->td_flags &= ~TDF_TSLEEPQ;
                cid = LOOKUP(td->td_wchan);
                gid = TCHASHSHIFT(cid);
                TAILQ_REMOVE(&gd->gd_tsleep_hash[gid], td, td_sleepq);
                if (TAILQ_FIRST(&gd->gd_tsleep_hash[gid]) == NULL) {
                        ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
                                               gd->gd_cpuid);
                }
                td->td_wchan = NULL;
                td->td_wdomain = 0;
        }
}

void
tsleep_remove(thread_t td)
{
        _tsleep_remove(td);
}
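/*
 * Usage sketch (illustrative only; 'foo_lock' and 'foo_ready' are
 * hypothetical caller state, not part of this file).  A caller protecting
 * its predicate with a lockmgr lock would typically interlock the sleep as
 * described above:
 *
 *      lockmgr(&foo_lock, LK_EXCLUSIVE);
 *      while (foo_ready == 0) {
 *              tsleep_interlock(&foo_ready, 0);
 *              lockmgr(&foo_lock, LK_RELEASE);
 *              tsleep(&foo_ready, PINTERLOCKED, "foowt", 0);
 *              lockmgr(&foo_lock, LK_EXCLUSIVE);
 *      }
 *      lockmgr(&foo_lock, LK_RELEASE);
 *
 * A wakeup(&foo_ready) issued between the tsleep_interlock() and the
 * tsleep() is not lost because the thread is already on the sleep queue.
 */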
/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If flags includes the PCATCH flag, signals are
 * checked before and after sleeping, otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * Note that if we are a process, we release_curproc() before messing with
 * the LWKT scheduler.
 *
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 *
 * WARNING! This code can't block (short of switching away), or bad things
 *          will happen.  No getting tokens, no blocking locks, etc.
 */
int
tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
{
        struct thread *td = curthread;
        struct lwp *lp = td->td_lwp;
        struct proc *p = td->td_proc;   /* may be NULL */
        globaldata_t gd;
        int sig;
        int catch;
        int error;
        int oldpri;
        struct callout thandle;

        /*
         * Currently a severe hack.  Make sure any delayed wakeups
         * are flushed before we sleep or we might deadlock on whatever
         * event we are sleeping on.
         */
        if (td->td_flags & TDF_DELAYED_WAKEUP)
                wakeup_end_delayed();

        /*
         * NOTE: removed KTRPOINT, it could cause races due to blocking
         * even in stable.  Just scrap it for now.
         */
        if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
                /*
                 * After a panic, or before we actually have an operational
                 * softclock, just give interrupts a chance, then just return;
                 *
                 * don't run any other procs or panic below,
                 * in case this is the idle process and already asleep.
                 */
                splz();
                oldpri = td->td_pri;
                lwkt_setpri_self(safepri);
                lwkt_switch();
                lwkt_setpri_self(oldpri);
                return (0);
        }
        logtsleep2(tsleep_beg, ident);
        gd = td->td_gd;
        KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
        td->td_wakefromcpu = -1;                /* overwritten by _wakeup */

        /*
         * NOTE: all of this occurs on the current cpu, including any
         * callout-based wakeups, so a critical section is a sufficient
         * interlock.
         *
         * The entire sequence through to where we actually sleep must
         * run without breaking the critical section.
         */
        catch = flags & PCATCH;
        error = 0;
        sig = 0;

        crit_enter_quick(td);

        KASSERT(ident != NULL, ("tsleep: no ident"));
        KASSERT(lp == NULL ||
                lp->lwp_stat == LSRUN ||        /* Obvious */
                lp->lwp_stat == LSSTOP,         /* Set in tstop */
                ("tsleep %p %s %d",
                 ident, wmesg, lp->lwp_stat));

        /*
         * We interlock the sleep queue if the caller has not already done
         * it for us.  This must be done before we potentially acquire any
         * tokens or we can lose the wakeup.
         */
        if ((flags & PINTERLOCKED) == 0) {
                _tsleep_interlock(gd, ident, flags);
        }

        /*
         * Setup for the current process (if this is a process).  We must
         * interlock with lwp_token to avoid remote wakeup races via
         * setrunnable().
         */
        if (lp) {
                lwkt_gettoken(&lp->lwp_token);

                /*
                 * If the umbrella process is in the SCORE state then
                 * make sure that the thread is flagged going into a
                 * normal sleep to allow the core dump to proceed, otherwise
                 * the coredump can end up waiting forever.  If the normal
                 * sleep is woken up, the thread will enter a stopped state
                 * upon return to userland.
                 *
                 * We do not want to interrupt or cause a thread exit at
                 * this juncture because that will mess up the state the
                 * coredump is trying to save.
                 */
                if (p->p_stat == SCORE &&
                    (lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
                        atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
                        ++p->p_nstopped;
                }

                /*
                 * PCATCH requested.
                 */
                if (catch) {
                        /*
                         * Early termination if PCATCH was set and a
                         * signal is pending, interlocked with the
                         * critical section.
                         *
                         * Early termination only occurs when tsleep() is
                         * entered while in a normal LSRUN state.
                         */
                        if ((sig = CURSIG(lp)) != 0)
                                goto resume;

                        /*
                         * Causes ksignal to wake us up if a signal is
                         * received (interlocked with lp->lwp_token).
                         */
                        lp->lwp_flags |= LWP_SINTR;
                }
        } else {
                KKASSERT(p == NULL);
        }

        /*
         * Make sure the current process has been untangled from
         * the userland scheduler and initialize slptime to start
         * counting.
         *
         * NOTE: td->td_wakefromcpu is pre-set by the release function
         *       for the dfly scheduler, and then adjusted by _wakeup().
         */
        if (lp) {
                p->p_usched->release_curproc(lp);
                lp->lwp_slptime = 0;
        }

        /*
         * If the interlocked flag is set but our cpu bit in the slpqueue
         * is no longer set, then a wakeup was processed in between the
         * tsleep_interlock() (ours or the caller's) and here.  This can
         * occur under numerous circumstances including when we release the
         * current process.
         *
         * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
         * to process incoming IPIs, thus draining incoming wakeups.
         */
        if ((td->td_flags & TDF_TSLEEPQ) == 0) {
                logtsleep2(ilockfail, ident);
                goto resume;
        }

        /*
         * Scheduling is blocked while in a critical section.  Coincide
         * the descheduled-by-tsleep flag with the descheduling of the
         * lwkt.
         *
         * The timer callout is localized on our cpu and interlocked by
         * our critical section.
         */
        lwkt_deschedule_self(td);
        td->td_flags |= TDF_TSLEEP_DESCHEDULED;
        td->td_wmesg = wmesg;

        /*
         * Setup the timeout, if any.  The timeout is only operable while
         * the thread is flagged descheduled.
         */
        KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
        if (timo) {
                callout_init_mp(&thandle);
                callout_reset(&thandle, timo, endtsleep, td);
        }

        /*
         * Beddy bye bye.
         */
        if (lp) {
                /*
                 * Ok, we are sleeping.  Place us in the SSLEEP state.
                 */
                KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);

                /*
                 * tstop() sets LSSTOP, so don't fiddle with that.
                 */
                if (lp->lwp_stat != LSSTOP)
                        lp->lwp_stat = LSSLEEP;
                lp->lwp_ru.ru_nvcsw++;
                p->p_usched->uload_update(lp);
                lwkt_switch();

                /*
                 * And when we are woken up, put us back in LSRUN.  If we
                 * slept for over a second, recalculate our estcpu.
                 */
                lp->lwp_stat = LSRUN;
                if (lp->lwp_slptime) {
                        p->p_usched->uload_update(lp);
                        p->p_usched->recalculate(lp);
                }
                lp->lwp_slptime = 0;
        } else {
                lwkt_switch();
        }

        /*
         * Make sure we haven't switched cpus while we were asleep.  It's
         * not supposed to happen.  Cleanup our temporary flags.
         */
        KKASSERT(gd == td->td_gd);

        /*
         * Cleanup the timeout.  If the timeout has already occurred, thandle
         * has already been stopped, otherwise stop thandle.
         * If the timeout is running (the callout thread must be blocked
         * trying to get lwp_token) then wait for us to get scheduled.
         */
        if (timo) {
                while (td->td_flags & TDF_TIMEOUT_RUNNING) {
                        /* else we won't get rescheduled! */
                        if (lp->lwp_stat != LSSTOP)
                                lp->lwp_stat = LSSLEEP;
                        lwkt_deschedule_self(td);
                        td->td_wmesg = "tsrace";
                        lwkt_switch();
                        kprintf("td %p %s: timeout race\n", td, td->td_comm);
                }
                if (td->td_flags & TDF_TIMEOUT) {
                        td->td_flags &= ~TDF_TIMEOUT;
                        error = EWOULDBLOCK;
                } else {
                        /* does not block when on same cpu */
                        callout_stop(&thandle);
                }
        }
        td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;

        /*
         * Make sure we have been removed from the sleepq.  In most
         * cases this will have been done for us already but it is
         * possible for a scheduling IPI to be in-flight from a
         * previous tsleep/tsleep_interlock() or due to a straight-out
         * call to lwkt_schedule() (in the case of an interrupt thread),
         * causing a spurious wakeup.
         */
        _tsleep_remove(td);
        td->td_wmesg = NULL;

        /*
         * Figure out the correct error return.  If interrupted by a
         * signal we want to return EINTR or ERESTART.
         */
resume:
        if (lp) {
                if (catch && error == 0) {
                        if (sig != 0 || (sig = CURSIG(lp))) {
                                if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
                                        error = EINTR;
                                else
                                        error = ERESTART;
                        }
                }

                lp->lwp_flags &= ~LWP_SINTR;

                /*
                 * Unconditionally set us to LSRUN on resume.  lwp_stat could
                 * be in a weird state due to the goto resume, particularly
                 * when tsleep() is called from tstop().
                 */
                lp->lwp_stat = LSRUN;
                lwkt_reltoken(&lp->lwp_token);
        }
        logtsleep1(tsleep_end);
        crit_exit_quick(td);
        return (error);
}

/*
 * Interlocked spinlock sleep.  An exclusively held spinlock must
 * be passed to ssleep().  The function will atomically release the
 * spinlock and tsleep on the ident, then reacquire the spinlock and
 * return.
 *
 * This routine is fairly important along the critical path, so optimize it
 * heavily.
 */
int
ssleep(const volatile void *ident, struct spinlock *spin, int flags,
       const char *wmesg, int timo)
{
        globaldata_t gd = mycpu;
        int error;

        _tsleep_interlock(gd, ident, flags);
        spin_unlock_quick(gd, spin);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        _spin_lock_quick(gd, spin, wmesg);

        return (error);
}

int
lksleep(const volatile void *ident, struct lock *lock, int flags,
        const char *wmesg, int timo)
{
        globaldata_t gd = mycpu;
        int error;

        _tsleep_interlock(gd, ident, flags);
        lockmgr(lock, LK_RELEASE);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        lockmgr(lock, LK_EXCLUSIVE);

        return (error);
}

/*
 * Interlocked mutex sleep.  An exclusively held mutex must be passed
 * to mtxsleep().  The function will atomically release the mutex
 * and tsleep on the ident, then reacquire the mutex and return.
 */
int
mtxsleep(const volatile void *ident, struct mtx *mtx, int flags,
         const char *wmesg, int timo)
{
        globaldata_t gd = mycpu;
        int error;

        _tsleep_interlock(gd, ident, flags);
        mtx_unlock(mtx);
        error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        mtx_lock_ex_quick(mtx);

        return (error);
}
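/*
 * Usage sketch (illustrative only; 'sc->spin' and 'sc->count' are
 * hypothetical consumer state, not part of this file).  The interlocked
 * forms above let a caller sleep on a predicate protected by its own lock
 * without an explicit tsleep_interlock() step:
 *
 *      spin_lock(&sc->spin);
 *      while (sc->count == 0)
 *              ssleep(&sc->count, &sc->spin, 0, "scwait", 0);
 *      --sc->count;
 *      spin_unlock(&sc->spin);
 *
 * The producer bumps sc->count under sc->spin and then calls
 * wakeup(&sc->count).  lksleep() and mtxsleep() follow the same pattern
 * for lockmgr locks and mtx mutexes respectively.
 */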
/*
 * Interlocked serializer sleep.  An exclusively held serializer must
 * be passed to zsleep().  The function will atomically release
 * the serializer and tsleep on the ident, then reacquire the serializer
 * and return.
 */
int
zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags,
       const char *wmesg, int timo)
{
        globaldata_t gd = mycpu;
        int ret;

        ASSERT_SERIALIZED(slz);

        _tsleep_interlock(gd, ident, flags);
        lwkt_serialize_exit(slz);
        ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo);
        lwkt_serialize_enter(slz);

        return ret;
}

/*
 * Directly block on the LWKT thread by descheduling it.  This
 * is much faster than tsleep(), but the only legal way to wake
 * us up is to directly schedule the thread.
 *
 * Setting TDF_SINTR will cause new signals to directly schedule us.
 *
 * This routine must be called while in a critical section.
 */
int
lwkt_sleep(const char *wmesg, int flags)
{
        thread_t td = curthread;
        int sig;

        if ((flags & PCATCH) == 0 || td->td_lwp == NULL) {
                td->td_flags |= TDF_BLOCKED;
                td->td_wmesg = wmesg;
                lwkt_deschedule_self(td);
                lwkt_switch();
                td->td_wmesg = NULL;
                td->td_flags &= ~TDF_BLOCKED;
                return(0);
        }
        if ((sig = CURSIG(td->td_lwp)) != 0) {
                if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig))
                        return(EINTR);
                else
                        return(ERESTART);
        }
        td->td_flags |= TDF_BLOCKED | TDF_SINTR;
        td->td_wmesg = wmesg;
        lwkt_deschedule_self(td);
        lwkt_switch();
        td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR);
        td->td_wmesg = NULL;
        return(0);
}

/*
 * Implement the timeout for tsleep.
 *
 * This type of callout timeout is scheduled on the same cpu the process
 * is sleeping on.  Also, at the moment, the MP lock is held.
 */
static void
endtsleep(void *arg)
{
        thread_t td = arg;
        struct lwp *lp;

        /*
         * We are going to have to get the lwp_token, which means we might
         * block.  This can race a tsleep getting woken up by other means
         * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our
         * processing to complete (sorry tsleep!).
         *
         * We can safely set td_flags because td MUST be on the same cpu
         * as we are.
         */
        KKASSERT(td->td_gd == mycpu);
        crit_enter();
        td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT;

        /*
         * This can block but TDF_TIMEOUT_RUNNING will prevent the thread
         * from exiting the tsleep on us.  The flag is interlocked by virtue
         * of lp being on the same cpu as we are.
         */
        if ((lp = td->td_lwp) != NULL)
                lwkt_gettoken(&lp->lwp_token);

        KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED);

        if (lp) {
                /*
                 * The callout timer should normally never be set in tstop()
                 * because it passes a timeout of 0.  However, there is a
                 * case during thread exit (which SSTOP's all the threads)
                 * for which tstop() must break out and can (properly) leave
                 * the thread in LSSTOP.
                 */
                KKASSERT(lp->lwp_stat != LSSTOP ||
                         (lp->lwp_mpflags & LWP_MP_WEXIT));
                setrunnable(lp);
                lwkt_reltoken(&lp->lwp_token);
        } else {
                _tsleep_remove(td);
                lwkt_schedule(td);
        }
        KKASSERT(td->td_gd == mycpu);
        td->td_flags &= ~TDF_TIMEOUT_RUNNING;
        crit_exit();
}
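/*
 * Usage sketch (illustrative only; 'sc->done' is hypothetical caller state,
 * not part of this file).  A caller combining a timeout with signal
 * catching distinguishes the documented return values:
 *
 *      error = tsleep(&sc->done, PCATCH, "scdone", 5 * hz);
 *      if (error == EWOULDBLOCK)
 *              ... the ~5 second timeout expired (endtsleep() above fired)
 *      else if (error == EINTR || error == ERESTART)
 *              ... a signal was delivered (PCATCH)
 *      else
 *              ... error == 0, a wakeup(&sc->done) occurred
 */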
/*
 * Make all processes sleeping on the specified identifier runnable.
 * count may be zero or one only.
 *
 * The domain encodes the sleep/wakeup domain, flags, plus the originating
 * cpu.
 *
 * This call may run without the MP lock held.  We can only manipulate thread
 * state on the cpu owning the thread.  We CANNOT manipulate process state
 * at all.
 *
 * _wakeup() can be passed to an IPI so we can't use (const volatile
 * void *ident).
 */
static void
_wakeup(void *ident, int domain)
{
        struct tslpque *qp;
        struct thread *td;
        struct thread *ntd;
        globaldata_t gd;
        cpumask_t mask;
        uint32_t cid;
        uint32_t gid;

        crit_enter();
        logtsleep2(wakeup_beg, ident);
        gd = mycpu;
        cid = LOOKUP(ident);
        gid = TCHASHSHIFT(cid);
        qp = &gd->gd_tsleep_hash[gid];
restart:
        for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
                ntd = TAILQ_NEXT(td, td_sleepq);
                if (td->td_wchan == ident &&
                    td->td_wdomain == (domain & PDOMAIN_MASK)
                ) {
                        KKASSERT(td->td_gd == gd);
                        _tsleep_remove(td);
                        td->td_wakefromcpu = PWAKEUP_DECODE(domain);
                        if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
                                lwkt_schedule(td);
                                if (domain & PWAKEUP_ONE)
                                        goto done;
                        }
                        goto restart;
                }
        }

        /*
         * Because a bunch of cpumask array entries cover the same queue, it
         * is possible for our bit to remain set in some of them and cause
         * spurious wakeup IPIs later on.  Make sure that the bit is cleared
         * when a spurious IPI occurs to prevent further spurious IPIs.
         */
        if (TAILQ_FIRST(qp) == NULL) {
                ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid);
        }

        /*
         * We finished checking the current cpu but there still may be
         * more work to do.  Either wakeup_one was requested and no matching
         * thread was found, or a normal wakeup was requested and we have
         * to continue checking cpus.
         *
         * It should be noted that this scheme is actually less expensive than
         * the old scheme when waking up multiple threads, since we send
         * only one IPI message per target candidate which may then schedule
         * multiple threads.  Before, we could have wound up sending an IPI
         * message for each thread on the target cpu (!= current cpu) that
         * needed to be woken up.
         *
         * NOTE: Wakeups occurring on remote cpus are asynchronous.  This
         *       should be ok since we are passing idents in the IPI rather
         *       than thread pointers.
         *
         * NOTE: We MUST mfence (or use an atomic op) prior to reading
         *       the cpumask, as another cpu may have written to it in
         *       a fashion interlocked with whatever the caller did before
         *       calling wakeup().  Otherwise we might miss the interaction
         *       (kern_mutex.c can cause this problem).
         *
         *       lfence is insufficient as it may allow a written state to
         *       reorder around the cpumask load.
         */
        if ((domain & PWAKEUP_MYCPU) == 0) {
                cpu_mfence();
                mask = slpque_cpumasks[cid];
                CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
                if (CPUMASK_TESTNZERO(mask)) {
                        lwkt_send_ipiq2_mask(mask, _wakeup, ident,
                                             domain | PWAKEUP_MYCPU);
                }
        }
done:
        logtsleep1(wakeup_end);
        crit_exit();
}

/*
 * Wakeup all threads tsleep()ing on the specified ident, on all cpus.
 */
void
wakeup(const volatile void *ident)
{
        globaldata_t gd = mycpu;
        thread_t td = gd->gd_curthread;

        if (td && (td->td_flags & TDF_DELAYED_WAKEUP)) {
                /*
                 * If we are in a delayed wakeup section, record up to two
                 * wakeups in a per-CPU queue and issue them when we block
                 * or exit the delayed wakeup section.
                 */
                if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[0], NULL, ident))
                        return;
                if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[1], NULL, ident))
                        return;

                ident = atomic_swap_ptr(__DEQUALIFY(volatile void **,
                                                    &gd->gd_delayed_wakeup[1]),
                                        __DEALL(ident));
                ident = atomic_swap_ptr(__DEQUALIFY(volatile void **,
                                                    &gd->gd_delayed_wakeup[0]),
                                        __DEALL(ident));
        }

        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, gd->gd_cpuid));
}

/*
 * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 */
void
wakeup_one(const volatile void *ident)
{
        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                                PWAKEUP_ONE);
}

/*
 * Wakeup threads tsleep()ing on the specified ident on the current cpu
 * only.
 */
void
wakeup_mycpu(const volatile void *ident)
{
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                                PWAKEUP_MYCPU);
}

/*
 * Wakeup one thread tsleep()ing on the specified ident on the current cpu
 * only.
 */
void
wakeup_mycpu_one(const volatile void *ident)
{
        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                                PWAKEUP_MYCPU | PWAKEUP_ONE);
}

/*
 * Wakeup all threads tsleep()ing on the specified ident on the specified
 * cpu only.
 */
void
wakeup_oncpu(globaldata_t gd, const volatile void *ident)
{
        globaldata_t mygd = mycpu;

        if (gd == mygd) {
                _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                        PWAKEUP_MYCPU);
        } else {
                lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
                                PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                PWAKEUP_MYCPU);
        }
}

/*
 * Wakeup one thread tsleep()ing on the specified ident on the specified
 * cpu only.
 */
void
wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
{
        globaldata_t mygd = mycpu;

        if (gd == mygd) {
                _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                        PWAKEUP_MYCPU | PWAKEUP_ONE);
        } else {
                lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
                                PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                PWAKEUP_MYCPU | PWAKEUP_ONE);
        }
}

/*
 * Wakeup all threads waiting on the specified ident that slept using
 * the specified domain, on all cpus.
 */
void
wakeup_domain(const volatile void *ident, int domain)
{
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
}
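/*
 * Usage sketch (illustrative only; 'sc->count' and 'sc->spin' are
 * hypothetical producer state, not part of this file).  The producer side
 * of the ssleep() sketch earlier in this file updates the predicate under
 * its own lock and then issues the wakeup:
 *
 *      spin_lock(&sc->spin);
 *      ++sc->count;
 *      spin_unlock(&sc->spin);
 *      wakeup(&sc->count);
 *
 * The cpu_mfence() in _wakeup() above helps ensure that the cross-cpu
 * cpumask check observes stores made by the caller before wakeup() was
 * called, per the NOTE in that function.
 */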
/*
 * Wakeup one thread waiting on the specified ident that slept using
 * the specified domain, on any cpu.
 */
void
wakeup_domain_one(const volatile void *ident, int domain)
{
        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident),
                PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
}

void
wakeup_start_delayed(void)
{
        globaldata_t gd = mycpu;

        crit_enter();
        gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP;
        crit_exit();
}

void
wakeup_end_delayed(void)
{
        globaldata_t gd = mycpu;

        if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) {
                crit_enter();
                gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP;
                if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) {
                        if (gd->gd_delayed_wakeup[0]) {
                                wakeup(gd->gd_delayed_wakeup[0]);
                                gd->gd_delayed_wakeup[0] = NULL;
                        }
                        if (gd->gd_delayed_wakeup[1]) {
                                wakeup(gd->gd_delayed_wakeup[1]);
                                gd->gd_delayed_wakeup[1] = NULL;
                        }
                }
                crit_exit();
        }
}
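/*
 * Usage sketch (illustrative only).  A caller that expects to issue several
 * wakeups in a short code path can batch them; wakeup() above records up to
 * two idents per cpu while TDF_DELAYED_WAKEUP is set, and they are flushed
 * here or when the thread blocks (see the top of tsleep()):
 *
 *      wakeup_start_delayed();
 *      ... update state, possibly calling wakeup() several times ...
 *      wakeup_end_delayed();
 */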
/*
 * setrunnable()
 *
 * Make a process runnable.  lp->lwp_token must be held on call and this
 * function must be called from the cpu owning lp.
 *
 * This only has an effect if we are in LSSTOP or LSSLEEP.
 */
void
setrunnable(struct lwp *lp)
{
        thread_t td = lp->lwp_thread;

        ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token);
        KKASSERT(td->td_gd == mycpu);
        crit_enter();
        if (lp->lwp_stat == LSSTOP)
                lp->lwp_stat = LSSLEEP;
        if (lp->lwp_stat == LSSLEEP) {
                _tsleep_remove(td);
                lwkt_schedule(td);
        } else if (td->td_flags & TDF_SINTR) {
                lwkt_schedule(td);
        }
        crit_exit();
}

/*
 * The process is stopped due to some condition, usually because p_stat is
 * set to SSTOP, but also possibly due to being traced.
 *
 * Caller must hold p->p_token
 *
 * NOTE! If the caller sets SSTOP, the caller must also clear P_WAITED
 * because the parent may check the child's status before the child actually
 * gets to this routine.
 *
 * This routine is called with the current lwp only, typically just
 * before returning to userland if the process state is detected as
 * possibly being in a stopped state.
 */
void
tstop(void)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p = lp->lwp_proc;
        struct proc *q;

        lwkt_gettoken(&lp->lwp_token);
        crit_enter();

        /*
         * If LWP_MP_WSTOP is set, we were sleeping
         * while our process was stopped.  At this point
         * we were already counted as stopped.
         */
        if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
                /*
                 * If we're the last thread to stop, signal
                 * our parent.
                 */
                p->p_nstopped++;
                atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
                wakeup(&p->p_nstopped);
                if (p->p_nstopped == p->p_nthreads) {
                        /*
                         * Token required to interlock kern_wait()
                         */
                        q = p->p_pptr;
                        PHOLD(q);
                        lwkt_gettoken(&q->p_token);
                        p->p_flags &= ~P_WAITED;
                        wakeup(p->p_pptr);
                        if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
                                ksignal(q, SIGCHLD);
                        lwkt_reltoken(&q->p_token);
                        PRELE(q);
                }
        }

        /*
         * Wait here while in a stopped state, interlocked with lwp_token.
         * We must break out if the whole process is trying to exit.
         */
        while (STOPLWP(p, lp)) {
                lp->lwp_stat = LSSTOP;
                tsleep(p, 0, "stop", 0);
        }
        p->p_nstopped--;
        atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
        crit_exit();
        lwkt_reltoken(&lp->lwp_token);
}

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.  This is a pcpu callout.
 *
 * We segment the lwp scan on a pcpu basis.  This does NOT
 * mean the associated lwps are on this cpu, it is done
 * just to break the work up.
 *
 * The callout on cpu0 rolls up the stats from the other
 * cpus.
 */
static int loadav_count_runnable(struct lwp *p, void *data);

static void
loadav(void *arg)
{
        globaldata_t gd = mycpu;
        struct loadavg *avg;
        int i, nrun;

        nrun = 0;
        alllwp_scan(loadav_count_runnable, &nrun, 1);
        gd->gd_loadav_nrunnable = nrun;
        if (gd->gd_cpuid == 0) {
                avg = &averunnable;
                nrun = 0;
                for (i = 0; i < ncpus; ++i)
                        nrun += globaldata_find(i)->gd_loadav_nrunnable;
                for (i = 0; i < 3; i++) {
                        avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
                            (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
                }
        }

        /*
         * Schedule the next update to occur after 5 seconds, but add a
         * random variation to avoid synchronisation with processes that
         * run at regular intervals.
         */
        callout_reset(&gd->gd_loadav_callout,
                      hz * 4 + (int)(krandom() % (hz * 2 + 1)),
                      loadav, NULL);
}

static int
loadav_count_runnable(struct lwp *lp, void *data)
{
        int *nrunp = data;
        thread_t td;

        switch (lp->lwp_stat) {
        case LSRUN:
                if ((td = lp->lwp_thread) == NULL)
                        break;
                if (td->td_flags & TDF_BLOCKED)
                        break;
                ++*nrunp;
                break;
        default:
                break;
        }
        lwkt_yield();
        return(0);
}

/*
 * Regular data collection
 */
static uint64_t
collect_load_callback(int n)
{
        int fscale = averunnable.fscale;

        return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale);
}

static void
sched_setup(void *dummy __unused)
{
        globaldata_t save_gd = mycpu;
        globaldata_t gd;
        int n;

        kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
                          KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));

        /*
         * Kick off the timeout driven events by calling them for the first
         * time.  We split the work across the available cpus to help scale
         * it; it can eat a lot of cpu when there are a lot of processes on
         * the system.
         */
        for (n = 0; n < ncpus; ++n) {
                gd = globaldata_find(n);
                lwkt_setcpu_self(gd);
                callout_init_mp(&gd->gd_loadav_callout);
                callout_init_mp(&gd->gd_schedcpu_callout);
                schedcpu(NULL);
                loadav(NULL);
        }
        lwkt_setcpu_self(save_gd);
}
/*
 * Extremely early initialization, dummy-up the tables so we don't have
 * to conditionalize for NULL in _wakeup() and tsleep_interlock().  Even
 * though the system isn't blocking this early, these functions still
 * try to access the hash table.
 *
 * This setup will be overridden once sched_dyninit() -> sleep_gdinit()
 * is called.
 */
void
sleep_early_gdinit(globaldata_t gd)
{
        static struct tslpque dummy_slpque;
        static cpumask_t dummy_cpumasks;

        slpque_tablesize = 1;
        gd->gd_tsleep_hash = &dummy_slpque;
        slpque_cpumasks = &dummy_cpumasks;
        TAILQ_INIT(&dummy_slpque);
}

/*
 * PCPU initialization.  Called after KMALLOC is operational, by
 * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later.
 *
 * WARNING! The pcpu hash table is smaller than the global cpumask
 *          hash table, which can save us a lot of memory when maxproc
 *          is set high.
 */
void
sleep_gdinit(globaldata_t gd)
{
        struct thread *td;
        uint32_t n;
        uint32_t i;

        /*
         * This shouldn't happen, that is, there shouldn't be any threads
         * waiting on the dummy tsleep queue this early in the boot.
         */
        if (gd->gd_cpuid == 0) {
                TAILQ_FOREACH(td, &gd->gd_tsleep_hash[0], td_sleepq) {
                        kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm);
                }
        }

        /*
         * Note that we have to allocate one extra slot because we are
         * shifting a modulo value.  TCHASHSHIFT(slpque_tablesize - 1) can
         * return the same value as TCHASHSHIFT(slpque_tablesize).
         */
        n = TCHASHSHIFT(slpque_tablesize) + 1;

        gd->gd_tsleep_hash = kmalloc(sizeof(struct tslpque) * n,
                                     M_TSLEEP, M_WAITOK | M_ZERO);
        for (i = 0; i < n; ++i)
                TAILQ_INIT(&gd->gd_tsleep_hash[i]);
}

/*
 * Dynamic initialization after the memory system is operational.
 */
static void
sched_dyninit(void *dummy __unused)
{
        int tblsize;
        int tblsize2;
        int n;

        /*
         * Calculate the table size for the slpque hash.  We want a prime
         * number large enough to avoid overloading slpque_cpumasks when the
         * system has a large number of sleeping processes, which would
         * spam IPIs on wakeup().
         *
         * While it is true this is really a per-lwp factor, generally
         * speaking the maxproc limit is a good metric to go by.
         */
        for (tblsize = maxproc | 1; ; tblsize += 2) {
                if (tblsize % 3 == 0)
                        continue;
                if (tblsize % 5 == 0)
                        continue;
                tblsize2 = (tblsize / 2) | 1;
                for (n = 7; n < tblsize2; n += 2) {
                        if (tblsize % n == 0)
                                break;
                }
                if (n == tblsize2)
                        break;
        }

        /*
         * PIDs are currently limited to 6 digits.  Cap the table size
         * at double this.
         */
        if (tblsize > 2000003)
                tblsize = 2000003;

        slpque_tablesize = tblsize;
        slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize,
                                  M_TSLEEP, M_WAITOK | M_ZERO);
        sleep_gdinit(mycpu);
}
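/*
 * Illustrative note (hypothetical numbers, not compiled): the sizing loop in
 * sched_dyninit() walks odd candidates starting at (maxproc | 1), skipping
 * multiples of 3 and 5 and trial-dividing by the remaining odd factors, so
 * it settles on a prime.  For example, with maxproc = 1000 it would reject
 * 1001 (7*11*13), 1003 (17*59) and 1007 (19*53) before accepting 1009,
 * giving a 1009-entry slpque_cpumasks table.
 */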