/*	$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats. Not really that necessary, but we should
	 * have room. Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 50 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;

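/*
 * rcpu_prevlwp doubles as the CPU ownership word: it holds the lwp
 * which most recently ran on (and released) the CPU, or one of the
 * two special values below.  RCPULWP_BUSY means the CPU is currently
 * held; RCPULWP_WANTED additionally means somebody is sleeping on
 * rcpu_cv waiting for the CPU to be released.
 */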
#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock. practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default for the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

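/*
 * Pick the next CPU to try when migrating in the scheduling slowpath:
 * a simple round-robin over all configured virtual CPUs.
 */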
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	cpu_setmodel("rumpcore (virtual)");

	mi_cpu_init();

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

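/*
 * Set up the virtual CPUs: the boot CPU uses the statically allocated
 * rump_bootcpu, the rest are allocated here.  Each CPU gets its own
 * hypervisor mutex and condition variable for the scheduling slowpath.
 */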
void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

	rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	rumpuser_cv_signal(cv);
	rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

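/*
 * lwp0 serves as a temporary bootstrap context for host threads which
 * enter the rump kernel without an lwp of their own.  Since there is
 * only one lwp0, its use is serialized with lwp0mtx/lwp0cv.
 */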
static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context.
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		struct proc *p = l->l_proc;
		rump_schedule_cpu(l);
		if (l->l_cred != p->p_cred) {
			kauth_cred_t oc = l->l_cred;
			mutex_enter(p->p_lock);
			l->l_cred = kauth_cred_hold(p->p_cred);
			mutex_exit(p->p_lock);
			kauth_cred_free(oc);
		}
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}
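
/*
 * Illustrative sketch (not compiled here): a host thread calling into
 * the rump kernel brackets the call with schedule/unschedule, e.g.
 *
 *	rump_schedule();
 *	error = some_rump_kernel_function();	(hypothetical callee)
 *	rump_unschedule();
 *
 * rump_schedule() guarantees a valid curlwp and a held virtual CPU for
 * the duration; rump_unschedule() releases both.
 */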

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ru.ru_nvcsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_onproc = l;
}

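/*
 * rump_unschedule: release the rump kernel CPU held by the calling
 * thread.  If the lwp was marked for exit or for clearing curlwp,
 * also tear down its association with the host thread.
 */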
void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

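/*
 * Release the virtual CPU proper.  Publishing the outgoing lwp in
 * rcpu_prevlwp is what enables the fastpath reclaim in
 * rump_schedule_cpu_interlock(); if a waiter already registered
 * interest (RCPULWP_WANTED), fall back to the slowpath and wake it.
 */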
void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled
	 * onto the rump kernel CPU before our current thread can
	 * grab the interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_release(); /* XXX what does this pair with? */

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

	sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

	rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

	return false;
}