/* $NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $ */

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 48 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;

#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)
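
/*
 * The rcpu_prevlwp field above doubles as the CPU ownership word and
 * holds one of the following values:
 *
 *	NULL		CPU is free and has no cached previous user
 *	(struct lwp *)	CPU is free; that lwp ran on it last (fastpath hint)
 *	RCPULWP_BUSY	CPU is currently held by some lwp
 *	RCPULWP_WANTED	CPU is held and a waiter has announced itself
 *
 * See rump_schedule_cpu_interlock() and rump_unschedule_cpu1() for the
 * authoritative transitions.
 */
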
static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default for the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	cpu_setmodel("rumpcore (virtual)");

	mi_cpu_init();

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

	rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	rumpuser_cv_signal(cv);
	rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}
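
/*
 * Illustrative sketch (assumed usage, not code from this file): the
 * "interlock" variants of schedule/unschedule let a caller atomically
 * give up its rump CPU and sleep on a condvar protected by that CPU's
 * scheduler mutex.  For a thread currently holding a rump CPU, the
 * sequence would look roughly like this (l and cv are the caller's):
 *
 *	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);
 *
 *	rump_unschedule_cpu_interlock(l, rcpu->rcpu_mtx);
 *	(rcpu_mtx is left held: the CPU was released under the lock)
 *	rump_schedlock_cv_wait(cv);
 *	(rumpuser drops and retakes rcpu_mtx around the wait)
 *	rump_schedule_cpu_interlock(l, rcpu->rcpu_mtx);
 *	(back on a rump CPU; the schedule path released the interlock)
 *
 * The real consumers live elsewhere in rumpkern; the point here is how
 * the interlock parameter threads through schedule/unschedule.
 */
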
static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is a 1:1 mapping
 * between the lwp and rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		struct proc *p = l->l_proc;
		rump_schedule_cpu(l);
		if (l->l_cred != p->p_cred) {
			kauth_cred_t oc = l->l_cred;
			mutex_enter(p->p_lock);
			l->l_cred = kauth_cred_hold(p->p_cred);
			mutex_exit(p->p_lock);
			kauth_cred_free(oc);
		}
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}
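
/*
 * Illustrative sketch (assumed usage, not code from this file): a host
 * thread calling into the rump kernel brackets its work with
 * rump_schedule()/rump_unschedule():
 *
 *	rump_schedule();
 *	error = some_rump_kernel_operation();	(hypothetical callee)
 *	rump_unschedule();
 *
 * On entry a rump CPU (and, if necessary, a temporary lwp) is acquired;
 * on exit it is released so that other host threads may use it.
 */
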
/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ru.ru_nvcsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_onproc = l;
}
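
/*
 * rump_unschedule: release the rump CPU held by the calling host thread.
 * If the lwp was marked dead-on-unschedule (LW_WEXIT), it cannot free
 * itself while still running on the CPU, so lwp0 is borrowed: busy lwp0,
 * re-schedule a CPU, switch to lwp0 (which reaps the old lwp), then
 * unschedule and release lwp0 again.
 */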
void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled onto
	 * the rump kernel CPU before our current thread can grab the
	 * interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_release(); /* XXX what does this pair with? */

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

	sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

	rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

	return false;
}