/*	$NetBSD: scheduler.c,v 1.27 2011/10/31 13:17:22 yamt Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.27 2011/10/31 13:17:22 yamt Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump/rumpuser.h>

#include "rump_private.h"

static struct cpu_info rump_cpus[MAXCPUS];
static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 48 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];
struct cpu_info *rump_cpu = &rump_cpus[0];
int ncpu;

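/*
 * rcpu_prevlwp encodes the scheduling state of a virtual CPU:
 * when the CPU is free it holds the lwp that last ran on it (or NULL),
 * RCPULWP_BUSY means the CPU is currently scheduled to somebody, and
 * RCPULWP_WANTED means it is busy and at least one other thread is
 * waiting for it on rcpu_cv.
 */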
#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * the stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default to the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif
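
/*
 * The counters above are updated only when building with
 * RUMPSCHED_STATS defined; otherwise the SCHED_* macros expand to
 * nothing.
 */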

struct cpu_info *
cpu_lookup(u_int index)
{

	return &rump_cpus[index];
}

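/*
 * Pick the next migration target in round-robin fashion.  nextcpu is
 * simply allowed to wrap; the modulo below keeps the index in range.
 */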
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int num = *nump;
	int i;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	for (i = 0; i < num; i++) {
		rcpu = &rcpu_storage[i];
		ci = &rump_cpus[i];
		ci->ci_index = i;
	}

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_cpus[0]);
	ncpu = 1;
	*nump = num;
}
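
/*
 * Note: only the first CPU is attached here.  The remaining CPUs are
 * expected to be attached later by the rump bootstrap code (outside
 * this file), which also raises ncpu to its final value.
 */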

void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		rcpu = &rcpu_storage[i];
		ci = &rump_cpus[i];
		rcpu->rcpu_ci = ci;
		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx);
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_NONE);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = &rcpu_storage[l->l_cpu-&rump_cpus[0]];

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}
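
/*
 * The wrapping rumpuser_cv_wait() above is expected to release the
 * rump CPU while the host thread blocks and to reschedule one before
 * returning, using rcpu_mtx as the interlock so that dropping the CPU
 * and going to sleep happen atomically with respect to the scheduler.
 */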

static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that rumpuser_get_curlwp() != NULL.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = rumpuser_get_curlwp()) != NULL)) {
		rump_schedule_cpu(l);
		LWP_CACHE_CREDS(l, l->l_proc);
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rumpuser_set_curlwp(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}
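
/*
 * Illustrative usage sketch (not code from this file): the rump
 * syscall wrappers bracket the actual handler roughly as
 *
 *	rump_schedule();
 *	error = handler(...);		<- hypothetical handler
 *	rump_unschedule();
 *
 * which is what gives the calling host thread a rump CPU and an lwp
 * context for the duration of the call.
 */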

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = &rcpu_storage[l->l_target_cpu-&rump_cpus[0]];
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (__predict_true(interlock == rcpu->rcpu_mtx))
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (__predict_true(interlock != rcpu->rcpu_mtx))
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	l->l_cpu = l->l_target_cpu = rcpu->rcpu_ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	rcpu->rcpu_ci->ci_curlwp = l;
}

void
rump_unschedule()
{
	struct lwp *l = rumpuser_get_curlwp();
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rumpuser_set_curlwp(NULL);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rumpuser_set_curlwp(NULL);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = NULL;
	rcpu = &rcpu_storage[ci-&rump_cpus[0]];

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 */
	membar_exit();

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		/* Was the scheduler interlock requested? */
		if (__predict_false(interlock == rcpu->rcpu_mtx))
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);

	if (__predict_true(interlock != rcpu->rcpu_mtx))
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}
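
/*
 * Note on ordering: the membar_exit() in rump_unschedule_cpu1() pairs
 * with the interlock acquisition performed by the next thread to take
 * this CPU on the slowpath ("acts as a load barrier too" above); the
 * fastpath needs no barrier because the CPU is re-taken by the same
 * thread that released it.
 */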

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}