xref: /netbsd-src/sys/rump/librump/rumpkern/scheduler.c (revision 68fa58437753598de948829082f591c269b48777)
/*      $NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 50 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;

#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default for the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif
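
/*
 * Illustrative sketch, not part of the original file: with
 * RUMPSCHED_STATS defined, the per-cpu counters above could be sampled
 * periodically along these lines.  The helper name and the
 * reset-after-sampling policy are hypothetical.
 */
#if 0
static void
rumpsched_stats_sample(void)
{
	struct rumpcpu *rcpu;
	int i;

	for (i = 0; i < ncpu; i++) {
		rcpu = &rcpu_storage[i];
		printf("cpu%d: fastpath %u slowpath %u migrated %u\n", i,
		    rcpu->rcpu_fastpath, rcpu->rcpu_slowpath,
		    rcpu->rcpu_migrated);
		/* the counters overflow quickly, so zero them after sampling */
		rcpu->rcpu_fastpath = 0;
		rcpu->rcpu_slowpath = 0;
		rcpu->rcpu_migrated = 0;
	}
}
#endif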

struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	cpu_setmodel("rumpcore (virtual)");

	mi_cpu_init();

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

	rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	rumpuser_cv_signal(cv);
	rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}
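
/*
 * Illustrative sketch, not part of the original file: a caller of the
 * schedlock condvar ops typically brackets the wait with the interlocked
 * unschedule/schedule, so that the virtual CPU is released while the host
 * thread sleeps and the scheduler mutex serves as the wait interlock.
 * The condvar and helper name here are hypothetical.
 */
#if 0
static void
example_block_on(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* release the virtual CPU; rcpu_mtx is left held as the interlock */
	rump_unschedule_cpu_interlock(l, rcpu->rcpu_mtx);

	/* sleep; rumpuser drops and retakes rcpu_mtx around the wait */
	rump_schedlock_cv_wait(cv);

	/* retake a virtual CPU; the held interlock is consumed and released */
	rump_schedule_cpu_interlock(l, rcpu->rcpu_mtx);
}
#endif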

static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context.
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		struct proc *p = l->l_proc;
		rump_schedule_cpu(l);
		if (l->l_cred != p->p_cred) {
			kauth_cred_t oc = l->l_cred;
			mutex_enter(p->p_lock);
			l->l_cred = kauth_cred_hold(p->p_cred);
			mutex_exit(p->p_lock);
			kauth_cred_free(oc);
		}
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}
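
/*
 * Illustrative sketch, not part of the original file: a host thread
 * entering the rump kernel brackets the call with
 * rump_schedule()/rump_unschedule() so that curlwp and a virtual CPU are
 * valid for the duration.  The function called in between is hypothetical.
 */
#if 0
static int
example_enter_rump_kernel(void)
{
	int error;

	rump_schedule();		/* guarantees curlwp and a virtual CPU */
	error = do_some_kernel_work();	/* hypothetical in-kernel operation */
	rump_unschedule();		/* give the virtual CPU back */

	return error;
}
#endif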

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ru.ru_nvcsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_onproc = l;
}
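
/*
 * Illustrative sketch, not part of the original file: the ownership word
 * rcpu_prevlwp above takes one of three kinds of values:
 *
 *	previous lwp	CPU is free; that lwp may reclaim it with one CAS
 *	RCPULWP_BUSY	CPU is taken and nobody is waiting
 *	RCPULWP_WANTED	CPU is taken and waiters sleep on rcpu_cv
 *
 * The hypothetical helper below is the fastpath claim in isolation.
 */
#if 0
static bool
example_fastpath_claim(struct rumpcpu *rcpu, struct lwp *l)
{

	/* succeeds only if l was the previous user of this virtual CPU */
	return atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l;
}
#endif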

void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled onto
	 * the rump kernel CPU before our current thread can grab the
	 * interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_release(); /* XXX what does this pair with? */

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}
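
/*
 * Illustrative sketch, not part of the original file: the release half of
 * the protocol in isolation.  Swapping the departing lwp back into
 * rcpu_prevlwp frees the CPU and records who gets the cheap reclaim next
 * time; seeing RCPULWP_WANTED means somebody is asleep on rcpu_cv and must
 * be woken.  The helper name is hypothetical.
 */
#if 0
static void
example_release_cpu(struct rumpcpu *rcpu, struct lwp *l)
{
	void *old;

	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);
	if (old == RCPULWP_WANTED) {
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
		if (rcpu->rcpu_wanted)
			rumpuser_cv_broadcast(rcpu->rcpu_cv);
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
	}
}
#endif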

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

	sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

	rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

	return false;
}