/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);

static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;

/*
 * dsched_debug() is a SYSCTL- and TUNABLE-controlled debug output
 * function using kvprintf().
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}
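
/*
 * Example (illustrative, added commentary): a policy can emit output
 * that only appears when the dsched.debug sysctl (or the corresponding
 * tunable) is at least the given syslog.h level, e.g.
 *
 *	dsched_debug(LOG_INFO, "queued bio %p\n", bio);
 *
 * prints only once dsched.debug has been raised to LOG_INFO (6) or
 * higher.
 */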

/*
 * Called on disk_create().  Tries to read which policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
	    head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    head_name);
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set &&
	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set) {
			dsched_debug(0, "No policy for %s%d specified, "
			    "or policy not found\n", head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}
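
/*
 * Illustrative loader.conf(5) entries matching the tunables probed
 * above; "fq" stands in for any registered policy name:
 *
 *	dsched.policy.ad0="fq"		# this disk only
 *	dsched.policy.ad="fq"		# all disks with this head name
 *	dsched.policy.default="fq"	# fallback for all other disks
 */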

/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check
 * if there's any policy associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}

/*
 * Called on disk_destroy(); shuts down the scheduler core and cancels
 * all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}


void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io		*tdio;
	struct dsched_disk_ctx		*diskctx;
	int found = 0, error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* We don't handle this case, let dsched dispatch */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);
	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}


/*
 * Called from each policy's module_init or module_attach; registers the
 * policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}
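
/*
 * Minimal sketch (illustrative; the "mypol" name and my_* callbacks are
 * hypothetical) of how a policy module registers itself, mirroring the
 * built-in noop policy declared above:
 *
 *	static struct dsched_policy my_policy = {
 *		.name = "mypol",
 *		.prepare = my_prepare,
 *		.teardown = my_teardown,
 *		.cancel_all = my_cancel,
 *		.bio_queue = my_queue
 *	};
 *
 *	error = dsched_register(&my_policy);	(typically from module_init)
 */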

/*
 * Called from each policy's module_detach; unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if it is locked already; if not, acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}

struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if it is locked already; if not, acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

struct disk *
dsched_find_disk(char *search)
{
	struct disk *dp_found = NULL;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(dp))) {
		if (!strcmp(dp->d_cdev->si_name, search)) {
			dp_found = dp;
			break;
		}
	}

	return dp_found;
}

struct disk *
dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(dp))) {
		if (dp->d_sched_policy == policy)
			return dp;
	}

	return NULL;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, this stuff shouldn't be needed... but just in case,
	 * we leave it in to avoid panics.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
#if 0
	/*
	 * Buffers undergoing device I/O do not need a kvabase/size.
	 */
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
#endif
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
#if 0
	nbp->b_kvabase = NULL;
	nbp->b_kvasize = 0;
#endif
	relpbuf(nbp, NULL);
}

void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done,
    void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}
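
/*
 * Sketch (illustrative) of the async path from a policy's point of
 * view; my_completed is hypothetical, and the dsched_get_bio_* getters
 * mirror the setters used above:
 *
 *	static void
 *	my_completed(struct bio *bio)
 *	{
 *		void *priv = dsched_get_bio_priv(bio);
 *
 *		(update accounting using priv, then complete the
 *		 original, pushed-down bio)
 *		biodone(pop_bio(bio));
 *	}
 *
 *	dsched_strategy_async(dp, bio, my_completed, priv);
 */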

void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

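/*
 * Note on the unref functions below (added commentary): when the last
 * reference is dropped (atomic_fetchadd_int() returns 1), the unref
 * path subtracts an additional 0x400 to push the refcount strongly
 * negative, marking the object as "in destruction".  The KKASSERTs
 * therefore accept only values that are either sane (>= 0) or clearly
 * carry the destruction mark (<= -0x400); anything in between
 * indicates a ref/unref race with the teardown.
 */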
void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&diskctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
		print_backtrace(4);
#endif
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		TAILQ_FOREACH_MUTABLE(tdio, &diskctx->tdio_list, dlink, tdio2) {
			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
			dsched_thread_io_unref(tdio);
		}
		lockmgr(&diskctx->lock, LK_RELEASE);
		if (diskctx->dp->d_sched_policy->destroy_diskctx)
			diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
		objcache_put(dsched_diskctx_cache, diskctx);
		atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
	}
}

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_disk_ctx	*diskctx;
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdio->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdio (%p) destruction started, trace:\n", tdio);
		print_backtrace(8);
#endif
		diskctx = tdio->diskctx;
		KKASSERT(diskctx != NULL);
		KKASSERT(tdio->qlength == 0);

		if (tdio->flags & DSCHED_LINKED_DISK_CTX) {
			lockmgr(&diskctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;

			lockmgr(&diskctx->lock, LK_RELEASE);
		}

		if (tdio->flags & DSCHED_LINKED_THREAD_CTX) {
			tdctx = tdio->tdctx;
			KKASSERT(tdctx != NULL);

			lockmgr(&tdctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;

			lockmgr(&tdctx->lock, LK_RELEASE);
		}
		if (tdio->diskctx->dp->d_sched_policy->destroy_tdio)
			tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		objcache_put(dsched_tdio_cache, tdio);
		atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
#if 0
		dsched_disk_ctx_unref(diskctx);
#endif
	}
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
		print_backtrace(8);
#endif
		DSCHED_GLOBAL_THREAD_CTX_LOCK();

		TAILQ_FOREACH_MUTABLE(tdio, &tdctx->tdio_list, link, tdio2) {
			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
			dsched_thread_io_unref(tdio);
		}
		TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

		DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

		objcache_put(dsched_tdctx_cache, tdctx);
		atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
	}
}


struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_io	*tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	/* XXX: maybe we do need another ref for the disk list for tdio */
	dsched_thread_io_ref(tdio);

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	tdio->flags |= DSCHED_LINKED_DISK_CTX;

	if (tdctx) {
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;

		/* Put the tdio in the tdctx list */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
		tdio->flags |= DSCHED_LINKED_THREAD_CTX;
	}

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
	return tdio;
}


struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}


struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io	*tdio;
	struct disk	*dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	/* XXX */
	while ((dp = disk_enumerate(dp))) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}

void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	struct dsched_thread_io *tdio;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
	}
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}

void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

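/*
 * Lifecycle note (added commentary): dsched_new_buf() below takes the
 * tdctx reference that is later dropped either in dsched_queue() (the
 * common path, see the "acquired on new_buf" comment there) or in
 * dsched_exit_buf() when a buffer dies without ever being queued
 * through dsched.
 */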
void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching
	 *	all threads. mi_startup() is still getting away without a
	 *	tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}


void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}


void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

struct dsched_thread_io *
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);

	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
	return tdio;
}

/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{

}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{

}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}

/*
 * SYSINIT stuff
 */
static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats,
	    sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}

static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}
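
/*
 * Example (illustrative): once dsched_sysctl_add_disk() has created a
 * per-disk node, the active policy can be inspected or switched from
 * userland, e.g.
 *
 *	sysctl dsched.policy.ad0		(show current policy)
 *	sysctl dsched.policy.ad0=noop		(switch to noop)
 */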

static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");

static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}
1154