xref: /netbsd-src/sys/rump/librump/rumpkern/scheduler.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
/*      $NetBSD: scheduler.c,v 1.39 2014/06/07 11:08:09 rmind Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.39 2014/06/07 11:08:09 rmind Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump/rumpuser.h>

#include "rump_private.h"

static struct cpu_info rump_cpus[MAXCPUS];
static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 48 (P=8) */

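	/*
	 * The zero-length, cache-line-aligned member below rounds
	 * sizeof(struct rumpcpu) up to a multiple of CACHE_LINE_SIZE,
	 * so each rcpu_storage[] slot gets its own cache line(s) and
	 * the fastpath members of one virtual CPU do not false-share
	 * with those of its neighbours.
	 */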
	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

struct cpu_info *rump_cpu = &rump_cpus[0];
kcpuset_t *kcpuset_attached = NULL;
kcpuset_t *kcpuset_running = NULL;
int ncpu;

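/*
 * rcpu_prevlwp is both a fastpath hint and a handoff flag: it holds
 * either the last lwp to run on the cpu (so that lwp can reclaim the
 * cpu with a single CAS), RCPULWP_BUSY (cpu taken, nobody waiting), or
 * RCPULWP_WANTED (cpu taken and waiters must be woken when it is
 * released).
 */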
#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default for the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif
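/*
 * The stats macros therefore compile to nothing by default; defining
 * RUMPSCHED_STATS at build time (e.g. -DRUMPSCHED_STATS in CPPFLAGS;
 * how the flag gets defined is up to the build) turns them into
 * per-cpu counter increments.
 */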

struct cpu_info *
cpu_lookup(u_int index)
{

	return &rump_cpus[index];
}

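/*
 * Pick a migration target for the scheduling slowpath, simply
 * round-robining over the configured cpus.  nextcpu is only a hint,
 * so the occasional race on it is harmless.
 */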
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	struct cpu_info *ci;
	int num = *nump;
	int i;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	for (i = 0; i < num; i++) {
		ci = &rump_cpus[i];
		ci->ci_index = i;
	}

	kcpuset_create(&kcpuset_attached, true);
	kcpuset_create(&kcpuset_running, true);

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_cpus[0]);
	ncpu = 1;
	*nump = num;
}

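/*
 * Set up the scheduler proper.  This runs after rump_cpus_bootstrap()
 * has settled the final cpu count: it wires each rumpcpu slot to its
 * cpu_info and creates the hypervisor mutex/condvar pairs used by the
 * scheduling slowpath, plus the lwp0 handoff primitives.
 */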
void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		rcpu = &rcpu_storage[i];
		ci = &rump_cpus[i];
		rcpu->rcpu_ci = ci;
		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

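/*
 * lwp0 is a shared, borrow-only context: it is handed out to one host
 * thread at a time (serialized by lwp0mtx/lwp0cv) to bootstrap a real
 * lwp in rump_schedule() and to reap an exiting one in
 * rump_unschedule().
 */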
static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context.
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and rump kernel cpu.
 */
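/*
 * Typical usage (sketch): a host thread entering the rump kernel
 * brackets its work with
 *
 *	rump_schedule();
 *	... run kernel code; curlwp and a rump cpu are now valid ...
 *	rump_unschedule();
 *
 * so the pair below is effectively the kernel entry/exit protocol for
 * callers arriving without a scheduled rump cpu.
 */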
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		rump_schedule_cpu(l);
		LWP_CACHE_CREDS(l, l->l_proc);
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
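/*
 * On the interlock argument: a caller that already holds the rump
 * cpu's scheduler mutex (see the schedlock cv ops above) passes it in
 * here.  The fastpath below simply drops it again, while the slowpath
 * reuses it as the mutex it would otherwise have to take.
 */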
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = &rcpu_storage[l->l_target_cpu-&rump_cpus[0]];
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

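	/*
	 * Common exit for both paths: the lwp now owns the cpu, so
	 * bind it there, account a context switch and mark it ONPROC
	 * before publishing it as the cpu's current lwp.
	 */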
 fastlane:
	l->l_cpu = l->l_target_cpu = rcpu->rcpu_ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	rcpu->rcpu_ci->ci_curlwp = l;
}

void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

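	/*
	 * Drain pending soft interrupts before the cpu is released,
	 * except when the departing lwp is itself an interrupt lwp
	 * (a softint lwp should not dispatch further softints here).
	 */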
	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = NULL;
	rcpu = &rcpu_storage[ci-&rump_cpus[0]];

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we need
	 * to obtain it before we release the CPU.  Otherwise, we risk a
	 * race condition where another thread is scheduled onto the rump
	 * kernel CPU before our current thread can grab the interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_exit();

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

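	/*
	 * While we held the cpu, the only stores other threads could
	 * make to rcpu_prevlwp were swaps to RCPULWP_WANTED, so if the
	 * old value was not BUSY it can only be WANTED here.
	 */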
	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
sched_enqueue(struct lwp *l, bool swtch)
{

	if (swtch)
		panic("sched_enqueue with switcheroo");
	rump_thread_allow(l);
}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}
571