/*	$NetBSD: scheduler.c,v 1.39 2014/06/07 11:08:09 rmind Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.39 2014/06/07 11:08:09 rmind Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump/rumpuser.h>

#include "rump_private.h"

static struct cpu_info rump_cpus[MAXCPUS];
static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 50 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

struct cpu_info *rump_cpu = &rump_cpus[0];
kcpuset_t *kcpuset_attached = NULL;
kcpuset_t *kcpuset_running = NULL;
int ncpu;

#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)
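
/*
 * Informal summary of the rcpu_prevlwp ownership word, as implemented by
 * the schedule/unschedule code below (orientation only, not an interface
 * contract):
 *
 *	lwp pointer / NULL	the CPU is free; the value names the lwp
 *				that ran on it last, which enables the
 *				lockless "same thread again" fastpath.
 *	RCPULWP_BUSY		the CPU is currently owned by some lwp.
 *	RCPULWP_WANTED		the CPU is owned and at least one other
 *				thread is sleeping on rcpu_cv waiting for
 *				it, so the owner must wake waiters when
 *				releasing.
 */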

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default for the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

	return &rump_cpus[index];
}

static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	struct cpu_info *ci;
	int num = *nump;
	int i;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	for (i = 0; i < num; i++) {
		ci = &rump_cpus[i];
		ci->ci_index = i;
	}

	kcpuset_create(&kcpuset_attached, true);
	kcpuset_create(&kcpuset_running, true);

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_cpus[0]);
	ncpu = 1;
	*nump = num;
}

void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		rcpu = &rcpu_storage[i];
		ci = &rump_cpus[i];
		rcpu->rcpu_ci = ci;
		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}
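
/*
 * A rough sketch of how the lwp0 handoff above is used (see
 * rump_schedule() and rump_unschedule() below): lwp0 is a single,
 * shared bootstrap context.  A host thread entering the rump kernel
 * without a dedicated lwp grabs it with lwp0busy(), borrows it just
 * long enough to allocate and switch to a real lwp, and returns it
 * with lwp0rele(); contenders sleep on lwp0cv until the busy flag
 * clears.  Illustratively, the overall calling pattern for such a
 * host thread (the real entry points wrapping these calls live
 * outside this file) is:
 *
 *	rump_schedule();	bind curlwp and a virtual CPU
 *	... run rump kernel code ...
 *	rump_unschedule();	release the CPU (and any temporary lwp)
 */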

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context.
 * ie. ensure that curlwp != NULL.  Also, ensure that there
 * is a 1:1 mapping between the lwp and rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		rump_schedule_cpu(l);
		LWP_CACHE_CREDS(l, l->l_proc);
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = &rcpu_storage[l->l_target_cpu-&rump_cpus[0]];
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	l->l_cpu = l->l_target_cpu = rcpu->rcpu_ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	rcpu->rcpu_ci->ci_curlwp = l;
}
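
/*
 * Informal sketch of the acquisition paths above, for orientation:
 *
 *	fastpath:  rcpu_prevlwp == l, CAS it to RCPULWP_BUSY and go.
 *	slowpath:  swap in RCPULWP_WANTED; if the old value showed a
 *		   free CPU, CAS WANTED -> BUSY to claim it.  Otherwise
 *		   migrate once to another virtual CPU, or sleep on
 *		   rcpu_cv until the owner releases the CPU and wakes us.
 */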

void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = NULL;
	rcpu = &rcpu_storage[ci-&rump_cpus[0]];

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we risk a
	 * race condition where another thread is scheduled onto the
	 * rump kernel CPU before our current thread can
	 * grab the interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_exit();

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}
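
/*
 * Release-side note (informal): the lock/membar_exit() above orders the
 * stores made while holding the CPU (e.g. the ci_curlwp clear) before
 * the swap that publishes the CPU as free, and the swapped-out old
 * value tells us whether anyone went to sleep waiting for this CPU
 * (RCPULWP_WANTED) or whether the wakeup can be skipped (RCPULWP_BUSY).
 */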

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
sched_enqueue(struct lwp *l, bool swtch)
{

	if (swtch)
		panic("sched_enqueue with switcheroo");
	rump_thread_allow(l);
}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}