xref: /dflybsd-src/sys/kern/kern_dsched.c (revision 5b22f1a7302b644c8e417d0bf1192e953e27d3b6)
/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);

static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;

/*
 * dsched_debug() is a debug output function controlled by the dsched.debug
 * sysctl/tunable; it prints via kvprintf() whenever 'level' is less than
 * or equal to the configured debug level.
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}

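/*
 * Usage sketch (the message and level below are hypothetical): with the
 * debug level raised via 'sysctl dsched.debug=4', a call such as
 *
 *	dsched_debug(4, "tdio %p: queued bio %p\n", tdio, bio);
 *
 * produces output; with the default level of 0 it stays silent.
 */
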
/*
 * Called on disk_create(). Tries to read which policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
	    head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    head_name);
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set) {
			dsched_debug(0, "No policy for %s%d specified, "
			    "or policy not found\n", head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}

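/*
 * Example loader.conf entries matching the tunable keys probed above
 * (the disk names and the "fq" policy name are hypothetical):
 *
 *	dsched.policy.ad0="fq"		# policy for a single unit
 *	dsched.policy.ad="fq"		# policy for a whole driver
 *	dsched.policy.default="fq"	# fallback for all other disks
 */
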
/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check
 * if there's any policy associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}

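/*
 * A serial-number keyed tunable, as probed above (the serial number and
 * policy name are hypothetical):
 *
 *	dsched.policy.S13UJ1NQ309218="fq"
 */
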
/*
 * Called on disk_destroy(); shuts down the scheduler core and cancels
 * all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}


void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io		*tdio;
	struct dsched_disk_ctx		*diskctx;

	int found = 0, error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* No thread context; bypass the policy and dispatch directly */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);
	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}


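/*
 * Note on the bio_queue contract, as exercised above: a policy's
 * bio_queue callback returns 0 once it has taken responsibility for the
 * bio (queued or dispatched), and nonzero to decline it, in which case
 * dsched_queue() falls back to dsched_strategy_raw().
 */
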
/*
 * Called from each policy's module_init or module_attach; registers
 * the policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}

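/*
 * Registration sketch for a hypothetical policy module (the "example"
 * name and its callbacks are assumptions, not part of this file),
 * mirroring the dsched_noop_policy definition above:
 *
 *	static struct dsched_policy example_policy = {
 *		.name = "example",
 *		.prepare = example_prepare,
 *		.teardown = example_teardown,
 *		.cancel_all = example_cancel,
 *		.bio_queue = example_queue
 *	};
 *
 *	error = dsched_register(&example_policy);  (EEXIST on duplicate name)
 */
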
/*
 * Called from each policy's module_detach; unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the scheduler core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if the lock is held already; if not, acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, release it again */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}

struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if the lock is held already; if not, acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, release it again */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

struct disk *
dsched_find_disk(char *search)
{
	struct disk *dp_found = NULL;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(dp))) {
		if (!strcmp(dp->d_cdev->si_name, search)) {
			dp_found = dp;
			break;
		}
	}

	return dp_found;
}

struct disk *
dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(dp))) {
		if (dp->d_sched_policy == policy)
			return dp;
	}

	return NULL;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally these checks wouldn't be needed; they are kept as a
	 * safety net to avoid panics.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
	relpbuf(nbp, NULL);
}

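/*
 * dsched_strategy_sync() clones the request into a pbuf, dispatches it
 * and blocks in biowait() until completion, so a policy may simply call
 * (sketch):
 *
 *	dsched_strategy_sync(dp, bio);
 *
 * and rely on the bio being completed when the call returns.
 */
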
void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}

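/*
 * Typical use by a policy (a sketch; mypolicy_completed and the tdio
 * argument are hypothetical):
 *
 *	dsched_strategy_async(dp, bio, mypolicy_completed, tdio);
 *
 * The 'done' callback is invoked when the pushed bio completes, and the
 * private data stored above is available to it again.
 */
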
void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

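	/*
	 * atomic_fetchadd_int() returns the value *before* the decrement,
	 * so 1 means we just dropped the last reference.  Biasing the
	 * counter by -0x400 below marks the context as being destroyed;
	 * the assertion above therefore accepts either a live count
	 * (>= 0) or a biased one (<= -0x400).
	 */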
	if (refcount == 1) {
		atomic_subtract_int(&diskctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
		print_backtrace(4);
#endif
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		TAILQ_FOREACH_MUTABLE(tdio, &diskctx->tdio_list, dlink, tdio2) {
			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
			dsched_thread_io_unref(tdio);
		}
		lockmgr(&diskctx->lock, LK_RELEASE);
		if (diskctx->dp->d_sched_policy->destroy_diskctx)
			diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
		objcache_put(dsched_diskctx_cache, diskctx);
		atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
	}
}

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_disk_ctx	*diskctx;
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdio->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdio (%p) destruction started, trace:\n", tdio);
		print_backtrace(8);
#endif
		diskctx = tdio->diskctx;
		KKASSERT(diskctx != NULL);
		KKASSERT(tdio->qlength == 0);

		if (tdio->flags & DSCHED_LINKED_DISK_CTX) {
			lockmgr(&diskctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;

			lockmgr(&diskctx->lock, LK_RELEASE);
		}

		if (tdio->flags & DSCHED_LINKED_THREAD_CTX) {
			tdctx = tdio->tdctx;
			KKASSERT(tdctx != NULL);

			lockmgr(&tdctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;

			lockmgr(&tdctx->lock, LK_RELEASE);
		}
		if (tdio->diskctx->dp->d_sched_policy->destroy_tdio)
			tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		objcache_put(dsched_tdio_cache, tdio);
		atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
#if 0
		dsched_disk_ctx_unref(diskctx);
#endif
	}
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
		print_backtrace(8);
#endif
		DSCHED_GLOBAL_THREAD_CTX_LOCK();

		TAILQ_FOREACH_MUTABLE(tdio, &tdctx->tdio_list, link, tdio2) {
			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
			dsched_thread_io_unref(tdio);
		}
		TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

		DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

		objcache_put(dsched_tdctx_cache, tdctx);
		atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
	}
}


struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_io	*tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	/* XXX: maybe we do need another ref for the disk list for tdio */
	dsched_thread_io_ref(tdio);

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	tdio->flags |= DSCHED_LINKED_DISK_CTX;

	if (tdctx) {
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;

		/* Put the tdio in the tdctx list */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
		tdio->flags |= DSCHED_LINKED_THREAD_CTX;
	}

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
	return tdio;
}


struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}


struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io	*tdio;
	struct disk	*dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	/* XXX */
	while ((dp = disk_enumerate(dp))) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}

void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	struct dsched_thread_io *tdio;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
	}
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}

void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching all
	 *	threads. mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}


void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}


void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

struct dsched_thread_io *
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);

	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
	return tdio;
}

/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{
}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{
}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}

/*
 * SYSINIT stuff
 */
static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_io_malloc_args );

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_ctx_malloc_args );

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_disk_ctx_malloc_args );

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}

1052 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1053 {
1054 	char buf[DSCHED_POLICY_NAME_LENGTH];
1055 	struct dsched_disk_ctx *diskctx = arg1;
1056 	struct dsched_policy *pol = NULL;
1057 	int error;
1058 
1059 	if (diskctx == NULL) {
1060 		return 0;
1061 	}
1062 
1063 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1064 
1065 	pol = diskctx->dp->d_sched_policy;
1066 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1067 
1068 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1069 	if (error || req->newptr == NULL) {
1070 		lockmgr(&dsched_lock, LK_RELEASE);
1071 		return (error);
1072 	}
1073 
1074 	pol = dsched_find_policy(buf);
1075 	if (pol == NULL) {
1076 		lockmgr(&dsched_lock, LK_RELEASE);
1077 		return 0;
1078 	}
1079 
1080 	dsched_switch(diskctx->dp, pol);
1081 
1082 	lockmgr(&dsched_lock, LK_RELEASE);
1083 
1084 	return error;
1085 }
1086 
static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");

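/*
 * Example userland interaction with the nodes above (the "ad0" disk and
 * the "fq" policy name are hypothetical):
 *
 *	sysctl dsched.policies			# list registered policies
 *	sysctl dsched.policy.default=fq		# change the default policy
 *	sysctl dsched.policy.ad0=fq		# switch the policy of one disk
 */
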
static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}
1145