/*
 * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>,
 * by Mihai Carabas <mihai.carabas@gmail.com>
 * and many others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/ktr.h>

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_uload_update(struct lwp *lp);
static void bsd4_yield(struct lwp *lp);

#ifdef SMP
static void bsd4_need_user_resched_remote(void *dummy);
static int bsd4_batchy_looser_pri_test(struct lwp* lp);
static struct lwp *bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp);
static void bsd4_kick_helper(struct lwp *lp);
#endif
static struct lwp *bsd4_chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	bsd4_uload_update,
	NULL,			/* setcpumask not supported */
	bsd4_yield
};

struct usched_bsd4_pcpu {
	struct thread	helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
	struct lwp	*old_uschedcp;
#ifdef SMP
	cpu_node_t	*cpunode;
#endif
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
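 *
 * For example, a normal-class lwp whose class-relative priority is 57
 * is filed on queue (57 & PRIMASK) / PPQ = 14.  If only queues 3 and 17
 * are occupied, bsd4_queuebits is (1 << 3) | (1 << 17), bsfl() returns
 * 3, and the next lwp is taken from the head of queue 3.  The realtime
 * queues are always scanned first, then the normal queues, then the
 * idle queues.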
157 */ 158 static struct rq bsd4_queues[NQS]; 159 static struct rq bsd4_rtqueues[NQS]; 160 static struct rq bsd4_idqueues[NQS]; 161 static u_int32_t bsd4_queuebits; 162 static u_int32_t bsd4_rtqueuebits; 163 static u_int32_t bsd4_idqueuebits; 164 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ 165 static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ 166 static int bsd4_runqcount; 167 #ifdef SMP 168 static volatile int bsd4_scancpu; 169 #endif 170 static struct spinlock bsd4_spin; 171 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; 172 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx; 173 static struct sysctl_oid *usched_bsd4_sysctl_tree; 174 175 /* Debug info exposed through debug.* sysctl */ 176 177 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, 178 &bsd4_runqcount, 0, 179 "Number of run queues"); 180 181 static int usched_bsd4_debug = -1; 182 SYSCTL_INT(_debug, OID_AUTO, bsd4_scdebug, CTLFLAG_RW, 183 &usched_bsd4_debug, 0, 184 "Print debug information for this pid"); 185 186 static int usched_bsd4_pid_debug = -1; 187 SYSCTL_INT(_debug, OID_AUTO, bsd4_pid_debug, CTLFLAG_RW, 188 &usched_bsd4_pid_debug, 0, 189 "Print KTR debug information for this pid"); 190 191 /* Tunning usched_bsd4 - configurable through kern.usched_bsd4.* */ 192 #ifdef SMP 193 static int usched_bsd4_smt = 0; 194 static int usched_bsd4_cache_coherent = 0; 195 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */ 196 static int usched_bsd4_queue_checks = 5; 197 static int usched_bsd4_stick_to_level = 0; 198 static long usched_bsd4_kicks; 199 #endif 200 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10; 201 static int usched_bsd4_decay = 8; 202 static int usched_bsd4_batch_time = 10; 203 204 /* KTR debug printings */ 205 206 KTR_INFO_MASTER_EXTERN(usched); 207 208 #if !defined(KTR_USCHED_BSD4) 209 #define KTR_USCHED_BSD4 KTR_ALL 210 #endif 211 212 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0, 213 "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted " 214 "after release: pid %d, cpuid %d, curr_cpuid %d)", 215 pid_t pid, int cpuid, int curr); 216 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0, 217 "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, " 218 "curr_cpuid %d)", 219 pid_t pid, int cpuid, int curr); 220 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0, 221 "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after " 222 "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)", 223 pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid); 224 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0, 225 "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, " 226 "cpuid %d, curr_cpuid %d)", 227 pid_t pid, int cpuid, int curr); 228 229 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0, 230 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " 231 "cpuid %d, curr_cpuid %d)", 232 pid_t pid, int cpuid, int curr); 233 234 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0, 235 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " 236 "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)", 237 pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr); 238 239 #ifdef SMP 240 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0, 241 "USCHED_BSD4(batchy_looser_pri_test false: pid %d, " 242 "cpuid %d, verify_mask %lu)", 243 pid_t pid, int cpuid, cpumask_t mask); 244 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0, 
245 "USCHED_BSD4(batchy_looser_pri_test true: pid %d, " 246 "cpuid %d, verify_mask %lu)", 247 pid_t pid, int cpuid, cpumask_t mask); 248 249 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0, 250 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, " 251 "mask %lu, curr_cpuid %d)", 252 pid_t pid, int cpuid, cpumask_t mask, int curr); 253 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0, 254 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, " 255 "cpuid %d, mask %lu, curr_cpuid %d)", 256 pid_t pid, int cpuid, cpumask_t mask, int curr); 257 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0, 258 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, " 259 "cpuid %d, mask %lu, curr_cpuid %d)", 260 pid_t pid, int cpuid, cpumask_t mask, int curr); 261 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0, 262 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 263 "mask %lu, found_cpuid %d, curr_cpuid %d)", 264 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); 265 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0, 266 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, " 267 "try_cpuid %d, curr_cpuid %d)", 268 pid_t pid, int cpuid, int try_cpuid, int curr); 269 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0, 270 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 271 "mask %lu, found_cpuid %d, curr_cpuid %d)", 272 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); 273 #endif 274 275 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0, 276 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)", 277 pid_t pid, int old_cpuid, int curr); 278 #ifdef SMP 279 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0, 280 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)", 281 pid_t pid, int old_cpuid, int curr); 282 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0, 283 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, " 284 "sibling_mask %lu, curr_cpumask %lu)", 285 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); 286 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0, 287 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, " 288 "sibling_mask %lu, curr_cpumask: %lu)", 289 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); 290 291 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0, 292 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)", 293 int id, pid_t pid, int cpuid); 294 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0, 295 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)", 296 int id, pid_t pid, int cpuid); 297 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0, 298 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)", 299 int id, cpumask_t tmpmask); 300 #endif 301 302 /* 303 * Initialize the run queues at boot time. 304 */ 305 static void 306 bsd4_rqinit(void *dummy) 307 { 308 int i; 309 310 spin_init(&bsd4_spin); 311 for (i = 0; i < NQS; i++) { 312 TAILQ_INIT(&bsd4_queues[i]); 313 TAILQ_INIT(&bsd4_rtqueues[i]); 314 TAILQ_INIT(&bsd4_idqueues[i]); 315 } 316 atomic_clear_cpumask(&bsd4_curprocmask, 1); 317 } 318 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL) 319 320 /* 321 * BSD4_ACQUIRE_CURPROC 322 * 323 * This function is called when the kernel intends to return to userland. 
324 * It is responsible for making the thread the current designated userland 325 * thread for this cpu, blocking if necessary. 326 * 327 * The kernel will not depress our LWKT priority until after we return, 328 * in case we have to shove over to another cpu. 329 * 330 * We must determine our thread's disposition before we switch away. This 331 * is very sensitive code. 332 * 333 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE 334 * TO ANOTHER CPU! Because most of the kernel assumes that no migration will 335 * occur, this function is called only under very controlled circumstances. 336 * 337 * MPSAFE 338 */ 339 static void 340 bsd4_acquire_curproc(struct lwp *lp) 341 { 342 globaldata_t gd; 343 bsd4_pcpu_t dd; 344 thread_t td; 345 #if 0 346 struct lwp *olp; 347 #endif 348 349 /* 350 * Make sure we aren't sitting on a tsleep queue. 351 */ 352 td = lp->lwp_thread; 353 crit_enter_quick(td); 354 if (td->td_flags & TDF_TSLEEPQ) 355 tsleep_remove(td); 356 bsd4_recalculate_estcpu(lp); 357 358 /* 359 * If a reschedule was requested give another thread the 360 * driver's seat. 361 */ 362 if (user_resched_wanted()) { 363 clear_user_resched(); 364 bsd4_release_curproc(lp); 365 366 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw, 367 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 368 lp->lwp_proc->p_pid, 369 lp->lwp_thread->td_gd->gd_cpuid, 370 mycpu->gd_cpuid); 371 } 372 373 /* 374 * Loop until we are the current user thread 375 */ 376 gd = mycpu; 377 dd = &bsd4_pcpu[gd->gd_cpuid]; 378 379 KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop, 380 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 381 lp->lwp_proc->p_pid, 382 lp->lwp_thread->td_gd->gd_cpuid, 383 gd->gd_cpuid); 384 385 do { 386 /* 387 * Process any pending events and higher priority threads. 388 */ 389 lwkt_yield(); 390 391 /* 392 * Become the currently scheduled user thread for this cpu 393 * if we can do so trivially. 394 * 395 * We can steal another thread's current thread designation 396 * on this cpu since if we are running that other thread 397 * must not be, so we can safely deschedule it. 398 */ 399 if (dd->uschedcp == lp) { 400 /* 401 * We are already the current lwp (hot path). 402 */ 403 dd->upri = lp->lwp_priority; 404 } else if (dd->uschedcp == NULL) { 405 /* 406 * We can trivially become the current lwp. 407 */ 408 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 409 dd->uschedcp = lp; 410 dd->upri = lp->lwp_priority; 411 } else if (dd->upri > lp->lwp_priority) { 412 /* 413 * We can steal the current cpu's lwp designation 414 * away simply by replacing it. The other thread 415 * will stall when it tries to return to userland. 416 */ 417 dd->uschedcp = lp; 418 dd->upri = lp->lwp_priority; 419 /* 420 lwkt_deschedule(olp->lwp_thread); 421 bsd4_setrunqueue(olp); 422 */ 423 } else { 424 /* 425 * We cannot become the current lwp, place the lp 426 * on the bsd4 run-queue and deschedule ourselves. 427 * 428 * When we are reactivated we will have another 429 * chance. 430 */ 431 lwkt_deschedule(lp->lwp_thread); 432 433 bsd4_setrunqueue(lp); 434 435 KTR_COND_LOG(usched_bsd4_acquire_curproc_not, 436 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 437 lp->lwp_proc->p_pid, 438 lp->lwp_thread->td_gd->gd_cpuid, 439 dd->uschedcp->lwp_proc->p_pid, 440 gd->gd_cpuid); 441 442 443 lwkt_switch(); 444 445 /* 446 * Reload after a switch or setrunqueue/switch possibly 447 * moved us to another cpu. 
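			 * (The enclosing do/while re-tests dd->uschedcp against
			 * lp, so if the wakeup landed us on a different cpu we
			 * simply compete for that cpu's user slot on the next
			 * pass.)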
448 */ 449 gd = mycpu; 450 dd = &bsd4_pcpu[gd->gd_cpuid]; 451 452 KTR_COND_LOG(usched_bsd4_acquire_curproc_switch, 453 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 454 lp->lwp_proc->p_pid, 455 lp->lwp_thread->td_gd->gd_cpuid, 456 gd->gd_cpuid); 457 } 458 } while (dd->uschedcp != lp); 459 460 crit_exit_quick(td); 461 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 462 } 463 464 /* 465 * BSD4_RELEASE_CURPROC 466 * 467 * This routine detaches the current thread from the userland scheduler, 468 * usually because the thread needs to run or block in the kernel (at 469 * kernel priority) for a while. 470 * 471 * This routine is also responsible for selecting a new thread to 472 * make the current thread. 473 * 474 * NOTE: This implementation differs from the dummy example in that 475 * bsd4_select_curproc() is able to select the current process, whereas 476 * dummy_select_curproc() is not able to select the current process. 477 * This means we have to NULL out uschedcp. 478 * 479 * Additionally, note that we may already be on a run queue if releasing 480 * via the lwkt_switch() in bsd4_setrunqueue(). 481 * 482 * MPSAFE 483 */ 484 485 static void 486 bsd4_release_curproc(struct lwp *lp) 487 { 488 globaldata_t gd = mycpu; 489 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 490 491 if (dd->uschedcp == lp) { 492 crit_enter(); 493 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 494 495 KTR_COND_LOG(usched_bsd4_release_curproc, 496 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 497 lp->lwp_proc->p_pid, 498 lp->lwp_thread->td_gd->gd_cpuid, 499 gd->gd_cpuid); 500 501 dd->uschedcp = NULL; /* don't let lp be selected */ 502 dd->upri = PRIBASE_NULL; 503 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 504 dd->old_uschedcp = lp; /* used only for KTR debug prints */ 505 bsd4_select_curproc(gd); 506 crit_exit(); 507 } 508 } 509 510 /* 511 * BSD4_SELECT_CURPROC 512 * 513 * Select a new current process for this cpu and clear any pending user 514 * reschedule request. The cpu currently has no current process. 515 * 516 * This routine is also responsible for equal-priority round-robining, 517 * typically triggered from bsd4_schedulerclock(). In our dummy example 518 * all the 'user' threads are LWKT scheduled all at once and we just 519 * call lwkt_switch(). 520 * 521 * The calling process is not on the queue and cannot be selected. 
522 * 523 * MPSAFE 524 */ 525 static 526 void 527 bsd4_select_curproc(globaldata_t gd) 528 { 529 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 530 struct lwp *nlp; 531 int cpuid = gd->gd_cpuid; 532 533 crit_enter_gd(gd); 534 535 spin_lock(&bsd4_spin); 536 #ifdef SMP 537 if(usched_bsd4_cache_coherent) 538 nlp = bsd4_chooseproc_locked_cache_coherent(dd->uschedcp); 539 else 540 #endif 541 nlp = bsd4_chooseproc_locked(dd->uschedcp); 542 543 if (nlp) { 544 545 KTR_COND_LOG(usched_bsd4_select_curproc, 546 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 547 nlp->lwp_proc->p_pid, 548 nlp->lwp_thread->td_gd->gd_cpuid, 549 dd->old_uschedcp->lwp_proc->p_pid, 550 dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid, 551 gd->gd_cpuid); 552 553 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid)); 554 dd->upri = nlp->lwp_priority; 555 dd->uschedcp = nlp; 556 dd->rrcount = 0; /* reset round robin */ 557 spin_unlock(&bsd4_spin); 558 #ifdef SMP 559 lwkt_acquire(nlp->lwp_thread); 560 #endif 561 lwkt_schedule(nlp->lwp_thread); 562 } else { 563 spin_unlock(&bsd4_spin); 564 } 565 566 #if 0 567 } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) { 568 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 569 spin_unlock(&bsd4_spin); 570 lwkt_schedule(&dd->helper_thread); 571 } else { 572 spin_unlock(&bsd4_spin); 573 } 574 #endif 575 crit_exit_gd(gd); 576 } 577 #ifdef SMP 578 579 /* 580 * batchy_looser_pri_test() - determine if a process is batchy or not 581 * relative to the other processes running in the system 582 */ 583 static int 584 bsd4_batchy_looser_pri_test(struct lwp* lp) 585 { 586 cpumask_t mask; 587 bsd4_pcpu_t other_dd; 588 int cpu; 589 590 /* Current running processes */ 591 mask = bsd4_curprocmask & smp_active_mask 592 & usched_global_cpumask; 593 594 while(mask) { 595 cpu = BSFCPUMASK(mask); 596 other_dd = &bsd4_pcpu[cpu]; 597 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) { 598 599 KTR_COND_LOG(usched_batchy_test_false, 600 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 601 lp->lwp_proc->p_pid, 602 lp->lwp_thread->td_gd->gd_cpuid, 603 (unsigned long)mask); 604 605 return 0; 606 } 607 mask &= ~CPUMASK(cpu); 608 } 609 610 KTR_COND_LOG(usched_batchy_test_true, 611 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 612 lp->lwp_proc->p_pid, 613 lp->lwp_thread->td_gd->gd_cpuid, 614 (unsigned long)mask); 615 616 return 1; 617 } 618 619 #endif 620 /* 621 * 622 * BSD4_SETRUNQUEUE 623 * 624 * Place the specified lwp on the user scheduler's run queue. This routine 625 * must be called with the thread descheduled. The lwp must be runnable. 626 * 627 * The thread may be the current thread as a special case. 628 * 629 * MPSAFE 630 */ 631 static void 632 bsd4_setrunqueue(struct lwp *lp) 633 { 634 globaldata_t gd; 635 bsd4_pcpu_t dd; 636 #ifdef SMP 637 int cpuid; 638 cpumask_t mask; 639 cpumask_t tmpmask; 640 #endif 641 642 /* 643 * First validate the process state relative to the current cpu. 644 * We don't need the spinlock for this, just a critical section. 645 * We are in control of the process. 646 */ 647 crit_enter(); 648 KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN")); 649 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0, 650 ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid, 651 lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags)); 652 KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0); 653 654 /* 655 * Note: gd and dd are relative to the target thread's last cpu, 656 * NOT our current cpu. 
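	 * (If another cpu does end up running the lwp, the lwkt_giveaway()
	 * below and the lwkt_acquire() done by whoever pulls it off the
	 * run queue handle the actual thread migration.)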
657 */ 658 gd = lp->lwp_thread->td_gd; 659 dd = &bsd4_pcpu[gd->gd_cpuid]; 660 661 /* 662 * This process is not supposed to be scheduled anywhere or assigned 663 * as the current process anywhere. Assert the condition. 664 */ 665 KKASSERT(dd->uschedcp != lp); 666 667 #ifndef SMP 668 /* 669 * If we are not SMP we do not have a scheduler helper to kick 670 * and must directly activate the process if none are scheduled. 671 * 672 * This is really only an issue when bootstrapping init since 673 * the caller in all other cases will be a user process, and 674 * even if released (dd->uschedcp == NULL), that process will 675 * kickstart the scheduler when it returns to user mode from 676 * the kernel. 677 */ 678 if (dd->uschedcp == NULL) { 679 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 680 dd->uschedcp = lp; 681 dd->upri = lp->lwp_priority; 682 lwkt_schedule(lp->lwp_thread); 683 crit_exit(); 684 return; 685 } 686 #endif 687 688 #ifdef SMP 689 /* 690 * XXX fixme. Could be part of a remrunqueue/setrunqueue 691 * operation when the priority is recalculated, so TDF_MIGRATING 692 * may already be set. 693 */ 694 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) 695 lwkt_giveaway(lp->lwp_thread); 696 #endif 697 698 /* 699 * We lose control of lp the moment we release the spinlock after 700 * having placed lp on the queue. i.e. another cpu could pick it 701 * up and it could exit, or its priority could be further adjusted, 702 * or something like that. 703 */ 704 spin_lock(&bsd4_spin); 705 bsd4_setrunqueue_locked(lp); 706 lp->lwp_rebal_ticks = sched_ticks; 707 708 #ifdef SMP 709 /* 710 * Kick the scheduler helper on one of the other cpu's 711 * and request a reschedule if appropriate. 712 * 713 * NOTE: We check all cpus whos rdyprocmask is set. First we 714 * look for cpus without designated lps, then we look for 715 * cpus with designated lps with a worse priority than our 716 * process. 717 */ 718 ++bsd4_scancpu; 719 720 if (usched_bsd4_smt) { 721 722 /* 723 * SMT heuristic - Try to schedule on a free physical core. 724 * If no physical core found than choose the one that has 725 * an interactive thread. 
726 */ 727 728 int best_cpuid = -1; 729 int min_prio = MAXPRI * MAXPRI; 730 int sibling; 731 732 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 733 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 734 smp_active_mask & usched_global_cpumask; 735 736 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt, 737 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 738 lp->lwp_proc->p_pid, 739 lp->lwp_thread->td_gd->gd_cpuid, 740 (unsigned long)mask, 741 mycpu->gd_cpuid); 742 743 while (mask) { 744 tmpmask = ~(CPUMASK(cpuid) - 1); 745 if (mask & tmpmask) 746 cpuid = BSFCPUMASK(mask & tmpmask); 747 else 748 cpuid = BSFCPUMASK(mask); 749 gd = globaldata_find(cpuid); 750 dd = &bsd4_pcpu[cpuid]; 751 752 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { 753 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) { 754 755 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 756 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 757 lp->lwp_proc->p_pid, 758 lp->lwp_thread->td_gd->gd_cpuid, 759 (unsigned long)mask, 760 cpuid, 761 mycpu->gd_cpuid); 762 763 goto found; 764 } else { 765 sibling = BSFCPUMASK(dd->cpunode->parent_node->members & 766 ~dd->cpunode->members); 767 if (min_prio > bsd4_pcpu[sibling].upri) { 768 min_prio = bsd4_pcpu[sibling].upri; 769 best_cpuid = cpuid; 770 } 771 } 772 } 773 mask &= ~CPUMASK(cpuid); 774 } 775 776 if (best_cpuid != -1) { 777 cpuid = best_cpuid; 778 gd = globaldata_find(cpuid); 779 dd = &bsd4_pcpu[cpuid]; 780 781 KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid, 782 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 783 lp->lwp_proc->p_pid, 784 lp->lwp_thread->td_gd->gd_cpuid, 785 (unsigned long)mask, 786 cpuid, 787 mycpu->gd_cpuid); 788 789 goto found; 790 } 791 } else { 792 /* Fallback to the original heuristic */ 793 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 794 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 795 smp_active_mask & usched_global_cpumask; 796 797 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt, 798 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 799 lp->lwp_proc->p_pid, 800 lp->lwp_thread->td_gd->gd_cpuid, 801 (unsigned long)mask, 802 mycpu->gd_cpuid); 803 804 while (mask) { 805 tmpmask = ~(CPUMASK(cpuid) - 1); 806 if (mask & tmpmask) 807 cpuid = BSFCPUMASK(mask & tmpmask); 808 else 809 cpuid = BSFCPUMASK(mask); 810 gd = globaldata_find(cpuid); 811 dd = &bsd4_pcpu[cpuid]; 812 813 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { 814 815 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 816 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 817 lp->lwp_proc->p_pid, 818 lp->lwp_thread->td_gd->gd_cpuid, 819 (unsigned long)mask, 820 cpuid, 821 mycpu->gd_cpuid); 822 823 goto found; 824 } 825 mask &= ~CPUMASK(cpuid); 826 } 827 } 828 829 /* 830 * Then cpus which might have a currently running lp 831 */ 832 mask = bsd4_curprocmask & bsd4_rdyprocmask & 833 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask; 834 835 KTR_COND_LOG(usched_bsd4_setrunqueue_rc, 836 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 837 lp->lwp_proc->p_pid, 838 lp->lwp_thread->td_gd->gd_cpuid, 839 (unsigned long)mask, 840 mycpu->gd_cpuid); 841 842 while (mask) { 843 tmpmask = ~(CPUMASK(cpuid) - 1); 844 if (mask & tmpmask) 845 cpuid = BSFCPUMASK(mask & tmpmask); 846 else 847 cpuid = BSFCPUMASK(mask); 848 gd = globaldata_find(cpuid); 849 dd = &bsd4_pcpu[cpuid]; 850 851 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 852 853 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 854 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 855 lp->lwp_proc->p_pid, 856 
lp->lwp_thread->td_gd->gd_cpuid, 857 (unsigned long)mask, 858 cpuid, 859 mycpu->gd_cpuid); 860 861 goto found; 862 } 863 mask &= ~CPUMASK(cpuid); 864 } 865 866 /* 867 * If we cannot find a suitable cpu we reload from bsd4_scancpu 868 * and round-robin. Other cpus will pickup as they release their 869 * current lwps or become ready. 870 * 871 * Avoid a degenerate system lockup case if usched_global_cpumask 872 * is set to 0 or otherwise does not cover lwp_cpumask. 873 * 874 * We only kick the target helper thread in this case, we do not 875 * set the user resched flag because 876 */ 877 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 878 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) { 879 cpuid = 0; 880 } 881 gd = globaldata_find(cpuid); 882 dd = &bsd4_pcpu[cpuid]; 883 884 KTR_COND_LOG(usched_bsd4_setrunqueue_not_found, 885 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 886 lp->lwp_proc->p_pid, 887 lp->lwp_thread->td_gd->gd_cpuid, 888 cpuid, 889 mycpu->gd_cpuid); 890 891 found: 892 if (gd == mycpu) { 893 spin_unlock(&bsd4_spin); 894 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 895 if (dd->uschedcp == NULL) { 896 wakeup_mycpu(&dd->helper_thread); 897 } else { 898 need_user_resched(); 899 } 900 } 901 } else { 902 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 903 spin_unlock(&bsd4_spin); 904 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) 905 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL); 906 else 907 wakeup(&dd->helper_thread); 908 } 909 #else 910 /* 911 * Request a reschedule if appropriate. 912 */ 913 spin_unlock(&bsd4_spin); 914 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 915 need_user_resched(); 916 } 917 #endif 918 crit_exit(); 919 } 920 921 /* 922 * This routine is called from a systimer IPI. It MUST be MP-safe and 923 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on 924 * each cpu. 925 * 926 * This routine is called on every sched tick. If the currently running 927 * thread belongs to this scheduler it will be called with a non-NULL lp, 928 * otherwise it will be called with a NULL lp. 929 * 930 * MPSAFE 931 */ 932 static 933 void 934 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) 935 { 936 globaldata_t gd = mycpu; 937 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 938 939 /* 940 * No impl if no lp running. 941 */ 942 if (lp == NULL) 943 return; 944 945 /* 946 * Do we need to round-robin? We round-robin 10 times a second. 947 * This should only occur for cpu-bound batch processes. 948 */ 949 if (++dd->rrcount >= usched_bsd4_rrinterval) { 950 dd->rrcount = 0; 951 need_user_resched(); 952 } 953 954 /* 955 * Adjust estcpu upward using a real time equivalent calculation. 956 */ 957 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1); 958 959 /* 960 * Spinlocks also hold a critical section so there should not be 961 * any active. 962 */ 963 KKASSERT(gd->gd_spinlocks == 0); 964 965 bsd4_resetpriority(lp); 966 } 967 968 /* 969 * Called from acquire and from kern_synch's one-second timer (one of the 970 * callout helper threads) with a critical section held. 971 * 972 * Decay p_estcpu based on the number of ticks we haven't been running 973 * and our p_nice. As the load increases each process observes a larger 974 * number of idle ticks (because other processes are running in them). 975 * This observation leads to a larger correction which tends to make the 976 * system more 'batchy'. 
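 * (With the default usched_bsd4_decay of 8 the weighted average used
 * below moves estcpu roughly 1/9 of the way toward its instantaneous
 * value on each recalculation, so a change in behaviour takes several
 * seconds to be fully reflected.)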
977 * 978 * Note that no recalculation occurs for a process which sleeps and wakes 979 * up in the same tick. That is, a system doing thousands of context 980 * switches per second will still only do serious estcpu calculations 981 * ESTCPUFREQ times per second. 982 * 983 * MPSAFE 984 */ 985 static 986 void 987 bsd4_recalculate_estcpu(struct lwp *lp) 988 { 989 globaldata_t gd = mycpu; 990 sysclock_t cpbase; 991 sysclock_t ttlticks; 992 int estcpu; 993 int decay_factor; 994 995 /* 996 * We have to subtract periodic to get the last schedclock 997 * timeout time, otherwise we would get the upcoming timeout. 998 * Keep in mind that a process can migrate between cpus and 999 * while the scheduler clock should be very close, boundary 1000 * conditions could lead to a small negative delta. 1001 */ 1002 cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; 1003 1004 if (lp->lwp_slptime > 1) { 1005 /* 1006 * Too much time has passed, do a coarse correction. 1007 */ 1008 lp->lwp_estcpu = lp->lwp_estcpu >> 1; 1009 bsd4_resetpriority(lp); 1010 lp->lwp_cpbase = cpbase; 1011 lp->lwp_cpticks = 0; 1012 lp->lwp_batch -= ESTCPUFREQ; 1013 if (lp->lwp_batch < 0) 1014 lp->lwp_batch = 0; 1015 } else if (lp->lwp_cpbase != cpbase) { 1016 /* 1017 * Adjust estcpu if we are in a different tick. Don't waste 1018 * time if we are in the same tick. 1019 * 1020 * First calculate the number of ticks in the measurement 1021 * interval. The ttlticks calculation can wind up 0 due to 1022 * a bug in the handling of lwp_slptime (as yet not found), 1023 * so make sure we do not get a divide by 0 panic. 1024 */ 1025 ttlticks = (cpbase - lp->lwp_cpbase) / 1026 gd->gd_schedclock.periodic; 1027 if ((ssysclock_t)ttlticks < 0) { 1028 ttlticks = 0; 1029 lp->lwp_cpbase = cpbase; 1030 } 1031 if (ttlticks == 0) 1032 return; 1033 updatepcpu(lp, lp->lwp_cpticks, ttlticks); 1034 1035 /* 1036 * Calculate the percentage of one cpu used factoring in ncpus 1037 * and the load and adjust estcpu. Handle degenerate cases 1038 * by adding 1 to bsd4_runqcount. 1039 * 1040 * estcpu is scaled by ESTCPUMAX. 1041 * 1042 * bsd4_runqcount is the excess number of user processes 1043 * that cannot be immediately scheduled to cpus. We want 1044 * to count these as running to avoid range compression 1045 * in the base calculation (which is the actual percentage 1046 * of one cpu used). 1047 */ 1048 estcpu = (lp->lwp_cpticks * ESTCPUMAX) * 1049 (bsd4_runqcount + ncpus) / (ncpus * ttlticks); 1050 1051 /* 1052 * If estcpu is > 50% we become more batch-like 1053 * If estcpu is <= 50% we become less batch-like 1054 * 1055 * It takes 30 cpu seconds to traverse the entire range. 1056 */ 1057 if (estcpu > ESTCPUMAX / 2) { 1058 lp->lwp_batch += ttlticks; 1059 if (lp->lwp_batch > BATCHMAX) 1060 lp->lwp_batch = BATCHMAX; 1061 } else { 1062 lp->lwp_batch -= ttlticks; 1063 if (lp->lwp_batch < 0) 1064 lp->lwp_batch = 0; 1065 } 1066 1067 if (usched_bsd4_debug == lp->lwp_proc->p_pid) { 1068 kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d", 1069 lp->lwp_proc->p_pid, lp, 1070 estcpu, lp->lwp_estcpu, 1071 lp->lwp_batch, 1072 lp->lwp_cpticks, ttlticks); 1073 } 1074 1075 /* 1076 * Adjust lp->lwp_esetcpu. The decay factor determines how 1077 * quickly lwp_estcpu collapses to its realtime calculation. 1078 * A slower collapse gives us a more accurate number but 1079 * can cause a cpu hog to eat too much cpu before the 1080 * scheduler decides to downgrade it. 
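		 * For example, an lwp that ran for all 40 of the last 40
		 * ticks on an otherwise idle 4-cpu system (bsd4_runqcount
		 * == 0) came out of the calculation above as
		 *
		 *	estcpu = (40 * 16384) * (0 + 4) / (4 * 40) = 16384
		 *
		 * i.e. ESTCPUMAX, or 100% of one cpu; the decay below blends
		 * that into lwp_estcpu rather than adopting it outright.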
1081 * 1082 * NOTE: p_nice is accounted for in bsd4_resetpriority(), 1083 * and not here, but we must still ensure that a 1084 * cpu-bound nice -20 process does not completely 1085 * override a cpu-bound nice +20 process. 1086 * 1087 * NOTE: We must use ESTCPULIM() here to deal with any 1088 * overshoot. 1089 */ 1090 decay_factor = usched_bsd4_decay; 1091 if (decay_factor < 1) 1092 decay_factor = 1; 1093 if (decay_factor > 1024) 1094 decay_factor = 1024; 1095 1096 lp->lwp_estcpu = ESTCPULIM( 1097 (lp->lwp_estcpu * decay_factor + estcpu) / 1098 (decay_factor + 1)); 1099 1100 if (usched_bsd4_debug == lp->lwp_proc->p_pid) 1101 kprintf(" finalestcpu %d\n", lp->lwp_estcpu); 1102 bsd4_resetpriority(lp); 1103 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic; 1104 lp->lwp_cpticks = 0; 1105 } 1106 } 1107 1108 /* 1109 * Compute the priority of a process when running in user mode. 1110 * Arrange to reschedule if the resulting priority is better 1111 * than that of the current process. 1112 * 1113 * This routine may be called with any process. 1114 * 1115 * This routine is called by fork1() for initial setup with the process 1116 * of the run queue, and also may be called normally with the process on or 1117 * off the run queue. 1118 * 1119 * MPSAFE 1120 */ 1121 static void 1122 bsd4_resetpriority(struct lwp *lp) 1123 { 1124 bsd4_pcpu_t dd; 1125 int newpriority; 1126 u_short newrqtype; 1127 int reschedcpu; 1128 int checkpri; 1129 int estcpu; 1130 1131 /* 1132 * Calculate the new priority and queue type 1133 */ 1134 crit_enter(); 1135 spin_lock(&bsd4_spin); 1136 1137 newrqtype = lp->lwp_rtprio.type; 1138 1139 switch(newrqtype) { 1140 case RTP_PRIO_REALTIME: 1141 case RTP_PRIO_FIFO: 1142 newpriority = PRIBASE_REALTIME + 1143 (lp->lwp_rtprio.prio & PRIMASK); 1144 break; 1145 case RTP_PRIO_NORMAL: 1146 /* 1147 * Detune estcpu based on batchiness. lwp_batch ranges 1148 * from 0 to BATCHMAX. Limit estcpu for the sake of 1149 * the priority calculation to between 50% and 100%. 1150 */ 1151 estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) / 1152 (BATCHMAX * 2); 1153 1154 /* 1155 * p_nice piece Adds (0-40) * 2 0-80 1156 * estcpu Adds 16384 * 4 / 512 0-128 1157 */ 1158 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; 1159 newpriority += estcpu * PPQ / ESTCPUPPQ; 1160 newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ / 1161 NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); 1162 newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK); 1163 break; 1164 case RTP_PRIO_IDLE: 1165 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK); 1166 break; 1167 case RTP_PRIO_THREAD: 1168 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK); 1169 break; 1170 default: 1171 panic("Bad RTP_PRIO %d", newrqtype); 1172 /* NOT REACHED */ 1173 } 1174 1175 /* 1176 * The newpriority incorporates the queue type so do a simple masked 1177 * check to determine if the process has moved to another queue. If 1178 * it has, and it is currently on a run queue, then move it. 1179 * 1180 * td_upri has normal sense (higher values are more desireable), so 1181 * negate it. 
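	 * For example, with the defaults a nice +0 lwp whose detuned estcpu
	 * works out to 8192 (50%) computes
	 *
	 *	((20 * 4 / 2) + (8192 * 4 / 512)) * 128 / 210 = 63
	 *
	 * above, so newpriority becomes PRIBASE_NORMAL + 63 and the lwp is
	 * filed on normal run queue 63 / PPQ = 15.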
1182 */ 1183 lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK); 1184 if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { 1185 lp->lwp_priority = newpriority; 1186 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) { 1187 bsd4_remrunqueue_locked(lp); 1188 lp->lwp_rqtype = newrqtype; 1189 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; 1190 bsd4_setrunqueue_locked(lp); 1191 checkpri = 1; 1192 } else { 1193 lp->lwp_rqtype = newrqtype; 1194 lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; 1195 checkpri = 0; 1196 } 1197 reschedcpu = lp->lwp_thread->td_gd->gd_cpuid; 1198 } else { 1199 lp->lwp_priority = newpriority; 1200 reschedcpu = -1; 1201 checkpri = 1; 1202 } 1203 1204 /* 1205 * Determine if we need to reschedule the target cpu. This only 1206 * occurs if the LWP is already on a scheduler queue, which means 1207 * that idle cpu notification has already occured. At most we 1208 * need only issue a need_user_resched() on the appropriate cpu. 1209 * 1210 * The LWP may be owned by a CPU different from the current one, 1211 * in which case dd->uschedcp may be modified without an MP lock 1212 * or a spinlock held. The worst that happens is that the code 1213 * below causes a spurious need_user_resched() on the target CPU 1214 * and dd->pri to be wrong for a short period of time, both of 1215 * which are harmless. 1216 * 1217 * If checkpri is 0 we are adjusting the priority of the current 1218 * process, possibly higher (less desireable), so ignore the upri 1219 * check which will fail in that case. 1220 */ 1221 if (reschedcpu >= 0) { 1222 dd = &bsd4_pcpu[reschedcpu]; 1223 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) && 1224 (checkpri == 0 || 1225 (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) { 1226 #ifdef SMP 1227 if (reschedcpu == mycpu->gd_cpuid) { 1228 spin_unlock(&bsd4_spin); 1229 need_user_resched(); 1230 } else { 1231 spin_unlock(&bsd4_spin); 1232 atomic_clear_cpumask(&bsd4_rdyprocmask, 1233 CPUMASK(reschedcpu)); 1234 lwkt_send_ipiq(lp->lwp_thread->td_gd, 1235 bsd4_need_user_resched_remote, 1236 NULL); 1237 } 1238 #else 1239 spin_unlock(&bsd4_spin); 1240 need_user_resched(); 1241 #endif 1242 } else { 1243 spin_unlock(&bsd4_spin); 1244 } 1245 } else { 1246 spin_unlock(&bsd4_spin); 1247 } 1248 crit_exit(); 1249 } 1250 1251 /* 1252 * MPSAFE 1253 */ 1254 static 1255 void 1256 bsd4_yield(struct lwp *lp) 1257 { 1258 #if 0 1259 /* FUTURE (or something similar) */ 1260 switch(lp->lwp_rqtype) { 1261 case RTP_PRIO_NORMAL: 1262 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR); 1263 break; 1264 default: 1265 break; 1266 } 1267 #endif 1268 need_user_resched(); 1269 } 1270 1271 /* 1272 * Called from fork1() when a new child process is being created. 1273 * 1274 * Give the child process an initial estcpu that is more batch then 1275 * its parent and dock the parent for the fork (but do not 1276 * reschedule the parent). This comprises the main part of our batch 1277 * detection heuristic for both parallel forking and sequential execs. 1278 * 1279 * XXX lwp should be "spawning" instead of "forking" 1280 * 1281 * MPSAFE 1282 */ 1283 static void 1284 bsd4_forking(struct lwp *plp, struct lwp *lp) 1285 { 1286 /* 1287 * Put the child 4 queue slots (out of 32) higher than the parent 1288 * (less desireable than the parent). 1289 */ 1290 lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4); 1291 1292 /* 1293 * The batch status of children always starts out centerline 1294 * and will inch-up or inch-down as appropriate. It takes roughly 1295 * ~15 seconds of >50% cpu to hit the limit. 
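	 * (BATCHMAX corresponds to 30 seconds worth of scheduler ticks, so
	 * starting at BATCHMAX / 2 leaves roughly 15 seconds of headroom in
	 * either direction.)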
1296 */ 1297 lp->lwp_batch = BATCHMAX / 2; 1298 1299 /* 1300 * Dock the parent a cost for the fork, protecting us from fork 1301 * bombs. If the parent is forking quickly make the child more 1302 * batchy. 1303 */ 1304 plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16); 1305 } 1306 1307 /* 1308 * Called when a lwp is being removed from this scheduler, typically 1309 * during lwp_exit(). 1310 */ 1311 static void 1312 bsd4_exiting(struct lwp *lp, struct proc *child_proc) 1313 { 1314 } 1315 1316 static void 1317 bsd4_uload_update(struct lwp *lp) 1318 { 1319 } 1320 1321 /* 1322 * chooseproc() is called when a cpu needs a user process to LWKT schedule, 1323 * it selects a user process and returns it. If chklp is non-NULL and chklp 1324 * has a better or equal priority then the process that would otherwise be 1325 * chosen, NULL is returned. 1326 * 1327 * Until we fix the RUNQ code the chklp test has to be strict or we may 1328 * bounce between processes trying to acquire the current process designation. 1329 * 1330 * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is 1331 * left intact through the entire routine. 1332 */ 1333 static 1334 struct lwp * 1335 bsd4_chooseproc_locked(struct lwp *chklp) 1336 { 1337 struct lwp *lp; 1338 struct rq *q; 1339 u_int32_t *which, *which2; 1340 u_int32_t pri; 1341 u_int32_t rtqbits; 1342 u_int32_t tsqbits; 1343 u_int32_t idqbits; 1344 cpumask_t cpumask; 1345 1346 rtqbits = bsd4_rtqueuebits; 1347 tsqbits = bsd4_queuebits; 1348 idqbits = bsd4_idqueuebits; 1349 cpumask = mycpu->gd_cpumask; 1350 1351 1352 #ifdef SMP 1353 again: 1354 #endif 1355 if (rtqbits) { 1356 pri = bsfl(rtqbits); 1357 q = &bsd4_rtqueues[pri]; 1358 which = &bsd4_rtqueuebits; 1359 which2 = &rtqbits; 1360 } else if (tsqbits) { 1361 pri = bsfl(tsqbits); 1362 q = &bsd4_queues[pri]; 1363 which = &bsd4_queuebits; 1364 which2 = &tsqbits; 1365 } else if (idqbits) { 1366 pri = bsfl(idqbits); 1367 q = &bsd4_idqueues[pri]; 1368 which = &bsd4_idqueuebits; 1369 which2 = &idqbits; 1370 } else { 1371 return NULL; 1372 } 1373 lp = TAILQ_FIRST(q); 1374 KASSERT(lp, ("chooseproc: no lwp on busy queue")); 1375 1376 #ifdef SMP 1377 while ((lp->lwp_cpumask & cpumask) == 0) { 1378 lp = TAILQ_NEXT(lp, lwp_procq); 1379 if (lp == NULL) { 1380 *which2 &= ~(1 << pri); 1381 goto again; 1382 } 1383 } 1384 #endif 1385 1386 /* 1387 * If the passed lwp <chklp> is reasonably close to the selected 1388 * lwp <lp>, return NULL (indicating that <chklp> should be kept). 1389 * 1390 * Note that we must error on the side of <chklp> to avoid bouncing 1391 * between threads in the acquire code. 1392 */ 1393 if (chklp) { 1394 if (chklp->lwp_priority < lp->lwp_priority + PPQ) 1395 return(NULL); 1396 } 1397 1398 #ifdef SMP 1399 /* 1400 * If the chosen lwp does not reside on this cpu spend a few 1401 * cycles looking for a better candidate at the same priority level. 1402 * This is a fallback check, setrunqueue() tries to wakeup the 1403 * correct cpu and is our front-line affinity. 
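	 * (Only the lwp immediately following the chosen one on its queue is
	 * examined here, which keeps the scan cheap while bsd4_spin is held.)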
1404 */ 1405 if (lp->lwp_thread->td_gd != mycpu && 1406 (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL 1407 ) { 1408 if (chklp->lwp_thread->td_gd == mycpu) { 1409 lp = chklp; 1410 } 1411 } 1412 #endif 1413 1414 KTR_COND_LOG(usched_chooseproc, 1415 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1416 lp->lwp_proc->p_pid, 1417 lp->lwp_thread->td_gd->gd_cpuid, 1418 mycpu->gd_cpuid); 1419 1420 TAILQ_REMOVE(q, lp, lwp_procq); 1421 --bsd4_runqcount; 1422 if (TAILQ_EMPTY(q)) 1423 *which &= ~(1 << pri); 1424 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!")); 1425 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1426 1427 return lp; 1428 } 1429 1430 #ifdef SMP 1431 /* 1432 * chooseproc() - with a cache coherence heuristic. Try to pull a process that 1433 * has its home on the current CPU> If the process doesn't have its home here 1434 * and is a batchy one (see batcy_looser_pri_test), we can wait for a 1435 * sched_tick, may be its home will become free and pull it in. Anyway, 1436 * we can't wait more than one tick. If that tick expired, we pull in that 1437 * process, no matter what. 1438 */ 1439 static 1440 struct lwp * 1441 bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp) 1442 { 1443 struct lwp *lp; 1444 struct rq *q; 1445 u_int32_t *which, *which2; 1446 u_int32_t pri; 1447 u_int32_t checks; 1448 u_int32_t rtqbits; 1449 u_int32_t tsqbits; 1450 u_int32_t idqbits; 1451 cpumask_t cpumask; 1452 1453 struct lwp * min_level_lwp = NULL; 1454 struct rq *min_q = NULL; 1455 cpumask_t siblings; 1456 cpu_node_t* cpunode = NULL; 1457 u_int32_t min_level = MAXCPU; /* number of levels < MAXCPU */ 1458 u_int32_t *min_which = NULL; 1459 u_int32_t min_pri = 0; 1460 u_int32_t level = 0; 1461 1462 rtqbits = bsd4_rtqueuebits; 1463 tsqbits = bsd4_queuebits; 1464 idqbits = bsd4_idqueuebits; 1465 cpumask = mycpu->gd_cpumask; 1466 1467 /* Get the mask coresponding to the sysctl configured level */ 1468 cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode; 1469 level = usched_bsd4_stick_to_level; 1470 while (level) { 1471 cpunode = cpunode->parent_node; 1472 level--; 1473 } 1474 /* The cpus which can ellect a process */ 1475 siblings = cpunode->members; 1476 checks = 0; 1477 1478 again: 1479 if (rtqbits) { 1480 pri = bsfl(rtqbits); 1481 q = &bsd4_rtqueues[pri]; 1482 which = &bsd4_rtqueuebits; 1483 which2 = &rtqbits; 1484 } else if (tsqbits) { 1485 pri = bsfl(tsqbits); 1486 q = &bsd4_queues[pri]; 1487 which = &bsd4_queuebits; 1488 which2 = &tsqbits; 1489 } else if (idqbits) { 1490 pri = bsfl(idqbits); 1491 q = &bsd4_idqueues[pri]; 1492 which = &bsd4_idqueuebits; 1493 which2 = &idqbits; 1494 } else { 1495 /* 1496 * No more left and we didn't reach the checks limit. 
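		 * Kick the home cpu of any candidate we deferred so it does
		 * not linger on the queue, and report that we found nothing.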
1497 */ 1498 bsd4_kick_helper(min_level_lwp); 1499 return NULL; 1500 } 1501 lp = TAILQ_FIRST(q); 1502 KASSERT(lp, ("chooseproc: no lwp on busy queue")); 1503 1504 /* 1505 * Limit the number of checks/queue to a configurable value to 1506 * minimize the contention (we are in a locked region 1507 */ 1508 while (checks < usched_bsd4_queue_checks) { 1509 if ((lp->lwp_cpumask & cpumask) == 0 || 1510 ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 && 1511 (lp->lwp_rebal_ticks == sched_ticks || 1512 lp->lwp_rebal_ticks == (int)(sched_ticks - 1)) && 1513 bsd4_batchy_looser_pri_test(lp))) { 1514 1515 KTR_COND_LOG(usched_chooseproc_cc_not_good, 1516 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1517 lp->lwp_proc->p_pid, 1518 (unsigned long)lp->lwp_thread->td_gd->gd_cpumask, 1519 (unsigned long)siblings, 1520 (unsigned long)cpumask); 1521 1522 cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode; 1523 level = 0; 1524 while (cpunode) { 1525 if (cpunode->members & cpumask) 1526 break; 1527 cpunode = cpunode->parent_node; 1528 level++; 1529 } 1530 if (level < min_level || 1531 (level == min_level && min_level_lwp && 1532 lp->lwp_priority < min_level_lwp->lwp_priority)) { 1533 bsd4_kick_helper(min_level_lwp); 1534 min_level_lwp = lp; 1535 min_level = level; 1536 min_q = q; 1537 min_which = which; 1538 min_pri = pri; 1539 } else { 1540 bsd4_kick_helper(lp); 1541 } 1542 lp = TAILQ_NEXT(lp, lwp_procq); 1543 if (lp == NULL) { 1544 *which2 &= ~(1 << pri); 1545 goto again; 1546 } 1547 } else { 1548 KTR_COND_LOG(usched_chooseproc_cc_elected, 1549 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1550 lp->lwp_proc->p_pid, 1551 (unsigned long)lp->lwp_thread->td_gd->gd_cpumask, 1552 (unsigned long)siblings, 1553 (unsigned long)cpumask); 1554 1555 goto found; 1556 } 1557 ++checks; 1558 } 1559 1560 /* 1561 * Checks exhausted, we tried to defer too many threads, so schedule 1562 * the best of the worst. 1563 */ 1564 lp = min_level_lwp; 1565 q = min_q; 1566 which = min_which; 1567 pri = min_pri; 1568 KASSERT(lp, ("chooseproc: at least the first lp was good")); 1569 1570 found: 1571 1572 /* 1573 * If the passed lwp <chklp> is reasonably close to the selected 1574 * lwp <lp>, return NULL (indicating that <chklp> should be kept). 1575 * 1576 * Note that we must error on the side of <chklp> to avoid bouncing 1577 * between threads in the acquire code. 1578 */ 1579 if (chklp) { 1580 if (chklp->lwp_priority < lp->lwp_priority + PPQ) { 1581 bsd4_kick_helper(lp); 1582 return(NULL); 1583 } 1584 } 1585 1586 KTR_COND_LOG(usched_chooseproc_cc, 1587 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1588 lp->lwp_proc->p_pid, 1589 lp->lwp_thread->td_gd->gd_cpuid, 1590 mycpu->gd_cpuid); 1591 1592 TAILQ_REMOVE(q, lp, lwp_procq); 1593 --bsd4_runqcount; 1594 if (TAILQ_EMPTY(q)) 1595 *which &= ~(1 << pri); 1596 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!")); 1597 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1598 1599 return lp; 1600 } 1601 1602 /* 1603 * If we aren't willing to schedule a ready process on our cpu, give it's 1604 * target cpu a kick rather than wait for the next tick. 1605 * 1606 * Called with bsd4_spin held. 
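 * The kick is either an IPI forcing a user reschedule on the target cpu
 * (when the lwp we are holding is better priority than what that cpu is
 * currently running) or a plain wakeup of the target's helper thread.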
1607 */ 1608 static 1609 void 1610 bsd4_kick_helper(struct lwp *lp) 1611 { 1612 globaldata_t gd; 1613 bsd4_pcpu_t dd; 1614 1615 if (lp == NULL) 1616 return; 1617 gd = lp->lwp_thread->td_gd; 1618 dd = &bsd4_pcpu[gd->gd_cpuid]; 1619 if ((smp_active_mask & usched_global_cpumask & 1620 bsd4_rdyprocmask & gd->gd_cpumask) == 0) { 1621 return; 1622 } 1623 ++usched_bsd4_kicks; 1624 atomic_clear_cpumask(&bsd4_rdyprocmask, gd->gd_cpumask); 1625 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 1626 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL); 1627 } else { 1628 wakeup(&dd->helper_thread); 1629 } 1630 } 1631 1632 static 1633 void 1634 bsd4_need_user_resched_remote(void *dummy) 1635 { 1636 globaldata_t gd = mycpu; 1637 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 1638 1639 need_user_resched(); 1640 1641 /* Call wakeup_mycpu to avoid sending IPIs to other CPUs */ 1642 wakeup_mycpu(&dd->helper_thread); 1643 } 1644 1645 #endif 1646 1647 /* 1648 * bsd4_remrunqueue_locked() removes a given process from the run queue 1649 * that it is on, clearing the queue busy bit if it becomes empty. 1650 * 1651 * Note that user process scheduler is different from the LWKT schedule. 1652 * The user process scheduler only manages user processes but it uses LWKT 1653 * underneath, and a user process operating in the kernel will often be 1654 * 'released' from our management. 1655 * 1656 * MPSAFE - bsd4_spin must be held exclusively on call 1657 */ 1658 static void 1659 bsd4_remrunqueue_locked(struct lwp *lp) 1660 { 1661 struct rq *q; 1662 u_int32_t *which; 1663 u_int8_t pri; 1664 1665 KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ); 1666 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1667 --bsd4_runqcount; 1668 KKASSERT(bsd4_runqcount >= 0); 1669 1670 pri = lp->lwp_rqindex; 1671 switch(lp->lwp_rqtype) { 1672 case RTP_PRIO_NORMAL: 1673 q = &bsd4_queues[pri]; 1674 which = &bsd4_queuebits; 1675 break; 1676 case RTP_PRIO_REALTIME: 1677 case RTP_PRIO_FIFO: 1678 q = &bsd4_rtqueues[pri]; 1679 which = &bsd4_rtqueuebits; 1680 break; 1681 case RTP_PRIO_IDLE: 1682 q = &bsd4_idqueues[pri]; 1683 which = &bsd4_idqueuebits; 1684 break; 1685 default: 1686 panic("remrunqueue: invalid rtprio type"); 1687 /* NOT REACHED */ 1688 } 1689 TAILQ_REMOVE(q, lp, lwp_procq); 1690 if (TAILQ_EMPTY(q)) { 1691 KASSERT((*which & (1 << pri)) != 0, 1692 ("remrunqueue: remove from empty queue")); 1693 *which &= ~(1 << pri); 1694 } 1695 } 1696 1697 /* 1698 * bsd4_setrunqueue_locked() 1699 * 1700 * Add a process whos rqtype and rqindex had previously been calculated 1701 * onto the appropriate run queue. Determine if the addition requires 1702 * a reschedule on a cpu and return the cpuid or -1. 1703 * 1704 * NOTE: Lower priorities are better priorities. 
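 * Lwps are appended to the tail of their queue while chooseproc pulls
 * from the head, giving FIFO round-robin behaviour among lwps that land
 * in the same priority band.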
1705 * 1706 * MPSAFE - bsd4_spin must be held exclusively on call 1707 */ 1708 static void 1709 bsd4_setrunqueue_locked(struct lwp *lp) 1710 { 1711 struct rq *q; 1712 u_int32_t *which; 1713 int pri; 1714 1715 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 1716 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1717 ++bsd4_runqcount; 1718 1719 pri = lp->lwp_rqindex; 1720 1721 switch(lp->lwp_rqtype) { 1722 case RTP_PRIO_NORMAL: 1723 q = &bsd4_queues[pri]; 1724 which = &bsd4_queuebits; 1725 break; 1726 case RTP_PRIO_REALTIME: 1727 case RTP_PRIO_FIFO: 1728 q = &bsd4_rtqueues[pri]; 1729 which = &bsd4_rtqueuebits; 1730 break; 1731 case RTP_PRIO_IDLE: 1732 q = &bsd4_idqueues[pri]; 1733 which = &bsd4_idqueuebits; 1734 break; 1735 default: 1736 panic("remrunqueue: invalid rtprio type"); 1737 /* NOT REACHED */ 1738 } 1739 1740 /* 1741 * Add to the correct queue and set the appropriate bit. If no 1742 * lower priority (i.e. better) processes are in the queue then 1743 * we want a reschedule, calculate the best cpu for the job. 1744 * 1745 * Always run reschedules on the LWPs original cpu. 1746 */ 1747 TAILQ_INSERT_TAIL(q, lp, lwp_procq); 1748 *which |= 1 << pri; 1749 } 1750 1751 #ifdef SMP 1752 1753 /* 1754 * For SMP systems a user scheduler helper thread is created for each 1755 * cpu and is used to allow one cpu to wakeup another for the purposes of 1756 * scheduling userland threads from setrunqueue(). 1757 * 1758 * UP systems do not need the helper since there is only one cpu. 1759 * 1760 * We can't use the idle thread for this because we might block. 1761 * Additionally, doing things this way allows us to HLT idle cpus 1762 * on MP systems. 1763 * 1764 * MPSAFE 1765 */ 1766 static void 1767 sched_thread(void *dummy) 1768 { 1769 globaldata_t gd; 1770 bsd4_pcpu_t dd; 1771 bsd4_pcpu_t tmpdd; 1772 struct lwp *nlp; 1773 cpumask_t mask; 1774 int cpuid; 1775 cpumask_t tmpmask; 1776 int tmpid; 1777 1778 gd = mycpu; 1779 cpuid = gd->gd_cpuid; /* doesn't change */ 1780 mask = gd->gd_cpumask; /* doesn't change */ 1781 dd = &bsd4_pcpu[cpuid]; 1782 1783 /* 1784 * Since we are woken up only when no user processes are scheduled 1785 * on a cpu, we can run at an ultra low priority. 1786 */ 1787 lwkt_setpri_self(TDPRI_USER_SCHEDULER); 1788 1789 tsleep(&dd->helper_thread, 0, "sched_thread_sleep", 0); 1790 1791 for (;;) { 1792 /* 1793 * We use the LWKT deschedule-interlock trick to avoid racing 1794 * bsd4_rdyprocmask. This means we cannot block through to the 1795 * manual lwkt_switch() call we make below. 1796 */ 1797 crit_enter_gd(gd); 1798 tsleep_interlock(&dd->helper_thread, 0); 1799 spin_lock(&bsd4_spin); 1800 atomic_set_cpumask(&bsd4_rdyprocmask, mask); 1801 1802 clear_user_resched(); /* This satisfied the reschedule request */ 1803 dd->rrcount = 0; /* Reset the round-robin counter */ 1804 1805 if ((bsd4_curprocmask & mask) == 0) { 1806 /* 1807 * No thread is currently scheduled. 
1808 */ 1809 KKASSERT(dd->uschedcp == NULL); 1810 if ((nlp = bsd4_chooseproc_locked(NULL)) != NULL) { 1811 KTR_COND_LOG(usched_sched_thread_no_process, 1812 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1813 gd->gd_cpuid, 1814 nlp->lwp_proc->p_pid, 1815 nlp->lwp_thread->td_gd->gd_cpuid); 1816 1817 atomic_set_cpumask(&bsd4_curprocmask, mask); 1818 dd->upri = nlp->lwp_priority; 1819 dd->uschedcp = nlp; 1820 dd->rrcount = 0; /* reset round robin */ 1821 spin_unlock(&bsd4_spin); 1822 lwkt_acquire(nlp->lwp_thread); 1823 lwkt_schedule(nlp->lwp_thread); 1824 } else { 1825 spin_unlock(&bsd4_spin); 1826 } 1827 } else if (bsd4_runqcount) { 1828 if ((nlp = bsd4_chooseproc_locked(dd->uschedcp)) != NULL) { 1829 KTR_COND_LOG(usched_sched_thread_process, 1830 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1831 gd->gd_cpuid, 1832 nlp->lwp_proc->p_pid, 1833 nlp->lwp_thread->td_gd->gd_cpuid); 1834 1835 dd->upri = nlp->lwp_priority; 1836 dd->uschedcp = nlp; 1837 dd->rrcount = 0; /* reset round robin */ 1838 spin_unlock(&bsd4_spin); 1839 lwkt_acquire(nlp->lwp_thread); 1840 lwkt_schedule(nlp->lwp_thread); 1841 } else { 1842 /* 1843 * CHAINING CONDITION TRAIN 1844 * 1845 * We could not deal with the scheduler wakeup 1846 * request on this cpu, locate a ready scheduler 1847 * with no current lp assignment and chain to it. 1848 * 1849 * This ensures that a wakeup race which fails due 1850 * to priority test does not leave other unscheduled 1851 * cpus idle when the runqueue is not empty. 1852 */ 1853 tmpmask = ~bsd4_curprocmask & 1854 bsd4_rdyprocmask & smp_active_mask; 1855 if (tmpmask) { 1856 tmpid = BSFCPUMASK(tmpmask); 1857 tmpdd = &bsd4_pcpu[tmpid]; 1858 atomic_clear_cpumask(&bsd4_rdyprocmask, 1859 CPUMASK(tmpid)); 1860 spin_unlock(&bsd4_spin); 1861 wakeup(&tmpdd->helper_thread); 1862 } else { 1863 spin_unlock(&bsd4_spin); 1864 } 1865 1866 KTR_LOG(usched_sched_thread_no_process_found, 1867 gd->gd_cpuid, (unsigned long)tmpmask); 1868 } 1869 } else { 1870 /* 1871 * The runq is empty. 1872 */ 1873 spin_unlock(&bsd4_spin); 1874 } 1875 1876 /* 1877 * We're descheduled unless someone scheduled us. Switch away. 1878 * Exiting the critical section will cause splz() to be called 1879 * for us if interrupts and such are pending. 1880 */ 1881 crit_exit_gd(gd); 1882 tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0); 1883 } 1884 } 1885 1886 /* sysctl stick_to_level parameter */ 1887 static int 1888 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS) 1889 { 1890 int error, new_val; 1891 1892 new_val = usched_bsd4_stick_to_level; 1893 1894 error = sysctl_handle_int(oidp, &new_val, 0, req); 1895 if (error != 0 || req->newptr == NULL) 1896 return (error); 1897 if (new_val > cpu_topology_levels_number - 1 || new_val < 0) 1898 return (EINVAL); 1899 usched_bsd4_stick_to_level = new_val; 1900 return (0); 1901 } 1902 1903 /* 1904 * Setup our scheduler helpers. Note that curprocmask bit 0 has already 1905 * been cleared by rqinit() and we should not mess with it further. 
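 * One helper thread is created for each cpu present in smp_active_mask
 * and that cpu's bsd4_rdyprocmask bit is set so setrunqueue() and the
 * other helpers can wake it.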
 */
static void
sched_thread_cpu_init(void)
{
	int i;
	int cpuid;
	int smt_not_supported = 0;
	int cache_coherent_not_supported = 0;

	if (bootverbose)
		kprintf("Start scheduler helpers on cpus:\n");

	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
	usched_bsd4_sysctl_tree =
		SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
		    "usched_bsd4", CTLFLAG_RD, 0, "");

	for (i = 0; i < ncpus; ++i) {
		bsd4_pcpu_t dd = &bsd4_pcpu[i];
		cpumask_t mask = CPUMASK(i);

		if ((mask & smp_active_mask) == 0)
			continue;

		dd->cpunode = get_cpu_node_by_cpuid(i);

		if (dd->cpunode == NULL) {
			smt_not_supported = 1;
			cache_coherent_not_supported = 1;
			if (bootverbose)
				kprintf("\tcpu%d - WARNING: No CPU NODE "
				    "found for cpu\n", i);
		} else {
			switch (dd->cpunode->type) {
			case THREAD_LEVEL:
				if (bootverbose)
					kprintf("\tcpu%d - HyperThreading "
					    "available. Core siblings: ",
					    i);
				break;
			case CORE_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf("\tcpu%d - No HT available, "
					    "multi-core/physical "
					    "cpu. Physical siblings: ",
					    i);
				break;
			case CHIP_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf("\tcpu%d - No HT available, "
					    "single-core/physical cpu. "
					    "Package Siblings: ",
					    i);
				break;
			default:
				/* Let's go for safe defaults here */
				smt_not_supported = 1;
				cache_coherent_not_supported = 1;
				if (bootverbose)
					kprintf("\tcpu%d - Unknown cpunode->"
					    "type=%u. Siblings: ",
					    i,
					    (u_int)dd->cpunode->type);
				break;
			}

			if (bootverbose) {
				if (dd->cpunode->parent_node != NULL) {
					CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
						kprintf("cpu%d ", cpuid);
					kprintf("\n");
				} else {
					kprintf(" no siblings\n");
				}
			}
		}

		lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
		    0, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has
		 * already been enabled in rqinit().
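		 *
		 * Clearing the cpu's bit in bsd4_curprocmask marks it as
		 * having no current user process; setting its bit in
		 * bsd4_rdyprocmask marks this cpu's helper thread as ready
		 * to be woken from setrunqueue().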
		 */
		if (i)
			atomic_clear_cpumask(&bsd4_curprocmask, mask);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);
		dd->upri = PRIBASE_NULL;

	}

	/* usched_bsd4 sysctl configurable parameters */

	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "rrinterval", CTLFLAG_RW,
	    &usched_bsd4_rrinterval, 0, "");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "decay", CTLFLAG_RW,
	    &usched_bsd4_decay, 0, "Extra decay when not running");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "batch_time", CTLFLAG_RW,
	    &usched_bsd4_batch_time, 0, "Min batch counter value");
	SYSCTL_ADD_LONG(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "kicks", CTLFLAG_RW,
	    &usched_bsd4_kicks, "Number of kickstarts");

	/* Add enable/disable option for SMT scheduling if supported */
	if (smt_not_supported) {
		usched_bsd4_smt = 0;
		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "smt", CTLFLAG_RD,
		    "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
	} else {
		usched_bsd4_smt = 1;
		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "smt", CTLFLAG_RW,
		    &usched_bsd4_smt, 0, "Enable SMT scheduling");
	}

	/*
	 * Add enable/disable option for cache coherent scheduling
	 * if supported
	 */
	if (cache_coherent_not_supported) {
		usched_bsd4_cache_coherent = 0;
		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "cache_coherent", CTLFLAG_RD,
		    "NOT SUPPORTED", 0,
		    "Cache coherence NOT SUPPORTED");
	} else {
		usched_bsd4_cache_coherent = 1;
		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "cache_coherent", CTLFLAG_RW,
		    &usched_bsd4_cache_coherent, 0,
		    "Enable/Disable cache coherent scheduling");

		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "upri_affinity", CTLFLAG_RW,
		    &usched_bsd4_upri_affinity, 1,
		    "Number of PPQs in user priority check");

		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "queue_checks", CTLFLAG_RW,
		    &usched_bsd4_queue_checks, 5,
		    "LWPs to check from a queue before giving up");

		SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "stick_to_level",
		    CTLTYPE_INT | CTLFLAG_RW,
		    NULL, sizeof usched_bsd4_stick_to_level,
		    sysctl_usched_bsd4_stick_to_level, "I",
		    "Stick a process to this level. See sysctl "
		    "parameter hw.cpu_topology.level_description");
	}
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_thread_cpu_init, NULL)

#else /* No SMP options - just add the configurable parameters to sysctl */

static void
sched_sysctl_tree_init(void)
{
	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
	usched_bsd4_sysctl_tree =
		SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
		    "usched_bsd4", CTLFLAG_RD, 0, "");

	/* usched_bsd4 sysctl configurable parameters */
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "rrinterval", CTLFLAG_RW,
	    &usched_bsd4_rrinterval, 0, "");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "decay", CTLFLAG_RW,
	    &usched_bsd4_decay, 0, "Extra decay when not running");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "batch_time", CTLFLAG_RW,
	    &usched_bsd4_batch_time, 0, "Min batch counter value");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_sysctl_tree_init, NULL)
#endif
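
/*
 * Example (illustrative only): the tunables registered above live under
 * the kern.usched_bsd4 sysctl tree and can be inspected or adjusted at
 * runtime with sysctl(8), e.g.:
 *
 *	sysctl kern.usched_bsd4.rrinterval
 *	sysctl kern.usched_bsd4.smt=0
 *	sysctl kern.usched_bsd4.stick_to_level=1
 *
 * Which knobs are present depends on what sched_thread_cpu_init()
 * detects; the smt and cache_coherent entries become read-only
 * "NOT SUPPORTED" strings when the cpu topology does not support them,
 * and the SMP-only knobs are absent on UP builds.
 */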