/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		default_prepare;
static dsched_teardown_t	default_teardown;
static dsched_cancel_t		default_cancel;
static dsched_queue_t		default_queue;

static d_open_t      dsched_dev_open;
static d_close_t     dsched_dev_close;
static d_ioctl_t     dsched_dev_ioctl;

static int dsched_dev_list_disks(struct dsched_ioctl *data);
static int dsched_dev_list_disk(struct dsched_ioctl *data);
static int dsched_dev_list_policies(struct dsched_ioctl *data);
static int dsched_dev_handle_switch(char *disk, char *policy);

static int	dsched_inited = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;
static cdev_t	dsched_dev;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_default_policy = {
	.name = "noop",

	.prepare = default_prepare,
	.teardown = default_teardown,
	.cancel_all = default_cancel,
	.bio_queue = default_queue
};

static struct dev_ops dsched_dev_ops = {
	{ "dsched", 0, 0 },
	.d_open = dsched_dev_open,
	.d_close = dsched_dev_close,
	.d_ioctl = dsched_dev_ioctl
};

/*
 * dsched_debug() is a SYSCTL- and TUNABLE-controlled debug output
 * function using kvprintf().  A message is printed only if its level
 * is at or below the current dsched.debug setting.
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}
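
/*
 * Illustrative use (message and arguments are hypothetical): the line
 * below is only printed once the dsched.debug sysctl/tunable is raised
 * to LOG_INFO or above.
 *
 *	dsched_debug(LOG_INFO, "queued bio %p on %s\n", bio, dev_name);
 */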

/*
 * Called on disk_create().  Tries to read which policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
	    head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    head_name);
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		dsched_debug(0, "No policy for %s%d specified, "
		    "or policy not found\n", head_name, unit);
		dsched_set_policy(dp, &dsched_default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	lockmgr(&dsched_lock, LK_RELEASE);
}
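
/*
 * Illustrative loader.conf entries matching the tunables probed above;
 * the disk names are examples, "noop" is the built-in default policy:
 *
 *	dsched.policy.ad0="noop"	# policy for the specific unit ad0
 *	dsched.policy.ad="noop"		# policy for all disks named ad*
 *	dsched.policy.default="noop"	# global fallback
 */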

/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo).  This will
 * check if there's any policy associated with the serial number of
 * the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	lockmgr(&dsched_lock, LK_RELEASE);
}
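
/*
 * Illustrative loader.conf entry keyed on a device serial number (the
 * serial number shown is made up):
 *
 *	dsched.policy.WD-WCAV51234567="noop"
 */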

/*
 * Called on disk_destroy().  Shuts down the scheduler core and cancels
 * all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_default_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}


void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io		*tdio;
	struct dsched_disk_ctx		*diskctx;

	int found = 0, error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* No thread context; dispatch directly, bypassing the policy */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);
	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}


/*
 * Called from each policy's module_init or module_attach.  Registers
 * the policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}
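
/*
 * A policy module would typically call dsched_register() from its
 * module handler.  A minimal sketch (names are hypothetical; the field
 * set mirrors dsched_default_policy above):
 *
 *	static struct dsched_policy my_policy = {
 *		.name = "mypolicy",
 *		.prepare = my_prepare,
 *		.teardown = my_teardown,
 *		.cancel_all = my_cancel,
 *		.bio_queue = my_queue
 *	};
 *
 *	error = dsched_register(&my_policy);
 */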

/*
 * Called from each policy's module_detach.  Unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_default_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* If the dsched lock is not held exclusively yet, acquire it */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}

struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* If the dsched lock is not held exclusively yet, acquire it */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

struct disk *
dsched_find_disk(char *search)
{
	struct disk *dp_found = NULL;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(dp))) {
		if (!strcmp(dp->d_cdev->si_name, search)) {
			dp_found = dp;
			break;
		}
	}

	return dp_found;
}

struct disk *
dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(dp))) {
		if (dp->d_sched_policy == policy)
			return dp;
	}

	return NULL;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally this shouldn't be needed, but we keep the checks in
	 * as a safety net to avoid panics.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
	relpbuf(nbp, NULL);
}

void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}
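
/*
 * A completion routine for dsched_strategy_async() might look like the
 * following sketch.  It assumes the matching dsched_get_bio_dp()/
 * dsched_get_bio_priv() accessors from <sys/dsched.h>; the names are
 * otherwise hypothetical.
 *
 *	static void
 *	my_completed(struct bio *bio)
 *	{
 *		struct disk *dp = dsched_get_bio_dp(bio);
 *		void *priv = dsched_get_bio_priv(bio);
 *
 *		... per-policy accounting using dp and priv ...
 *
 *		biodone(pop_bio(bio));
 *	}
 */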

void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);

	KKASSERT(refcount >= 0);
}
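
/*
 * The unref functions below drop one reference; the caller that drops
 * the last one (refcount was 1) subtracts a further 0x400 to mark the
 * object as being in destruction and then frees it.  This is why the
 * KKASSERTs accept either a normal count (>= 0) or a marked one
 * (<= -0x400): any value in between indicates a refcounting bug.
 */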

void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&diskctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
		print_backtrace(4);
#endif
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		TAILQ_FOREACH_MUTABLE(tdio, &diskctx->tdio_list, dlink, tdio2) {
			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
			dsched_thread_io_unref(tdio);
		}
		lockmgr(&diskctx->lock, LK_RELEASE);
		if (diskctx->dp->d_sched_policy->destroy_diskctx)
			diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
		objcache_put(dsched_diskctx_cache, diskctx);
		atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
	}
}

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_disk_ctx	*diskctx;
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdio->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdio (%p) destruction started, trace:\n", tdio);
		print_backtrace(8);
#endif
		diskctx = tdio->diskctx;
		KKASSERT(diskctx != NULL);
		KKASSERT(tdio->qlength == 0);

		if (tdio->flags & DSCHED_LINKED_DISK_CTX) {
			lockmgr(&diskctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;

			lockmgr(&diskctx->lock, LK_RELEASE);
		}

		if (tdio->flags & DSCHED_LINKED_THREAD_CTX) {
			tdctx = tdio->tdctx;
			KKASSERT(tdctx != NULL);

			lockmgr(&tdctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;

			lockmgr(&tdctx->lock, LK_RELEASE);
		}
		if (tdio->diskctx->dp->d_sched_policy->destroy_tdio)
			tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		objcache_put(dsched_tdio_cache, tdio);
		atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
#if 0
		dsched_disk_ctx_unref(diskctx);
#endif
	}
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
		print_backtrace(8);
#endif
		DSCHED_GLOBAL_THREAD_CTX_LOCK();

		TAILQ_FOREACH_MUTABLE(tdio, &tdctx->tdio_list, link, tdio2) {
			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
			dsched_thread_io_unref(tdio);
		}
		TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

		DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

		objcache_put(dsched_tdctx_cache, tdctx);
		atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
	}
}


struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_io	*tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	/* XXX: maybe we do need another ref for the disk list for tdio */
	dsched_thread_io_ref(tdio);

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	tdio->flags |= DSCHED_LINKED_DISK_CTX;

	if (tdctx) {
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;

		/* Put the tdio in the tdctx list */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
		tdio->flags |= DSCHED_LINKED_THREAD_CTX;
	}

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
	return tdio;
}


struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}


struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io	*tdio;
	struct disk	*dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	/* XXX */
	while ((dp = disk_enumerate(dp))) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}

void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	struct dsched_thread_io *tdio;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
	}
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}

void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching all
	 *	threads. mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}


void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}


void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

/* DEFAULT NOOP POLICY */

static int
default_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
default_teardown(struct dsched_disk_ctx *diskctx)
{

}

static void
default_cancel(struct dsched_disk_ctx *diskctx)
{

}

static int
default_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, default_completed, NULL);
#endif
	return 0;
}


/*
 * dsched device stuff
 */

static int
dsched_dev_list_disks(struct dsched_ioctl *data)
{
	struct disk *dp = NULL;
	uint32_t i;

	/* Walk to the disk at index num_elem (0-based), if any */
	for (i = 0; (i <= data->num_elem) && (dp = disk_enumerate(dp)); i++)
		;

	if (dp == NULL)
		return -1;

	strncpy(data->dev_name, dp->d_cdev->si_name, sizeof(data->dev_name));

	if (dp->d_sched_policy) {
		strncpy(data->pol_name, dp->d_sched_policy->name,
		    sizeof(data->pol_name));
	} else {
		strncpy(data->pol_name, "N/A (error)", 12);
	}

	return 0;
}

static int
dsched_dev_list_disk(struct dsched_ioctl *data)
{
	struct disk *dp = NULL;
	int found = 0;

	while ((dp = disk_enumerate(dp))) {
		if (!strncmp(dp->d_cdev->si_name, data->dev_name,
		    sizeof(data->dev_name))) {
			KKASSERT(dp->d_sched_policy != NULL);

			found = 1;
			strncpy(data->pol_name, dp->d_sched_policy->name,
			    sizeof(data->pol_name));
			break;
		}
	}
	if (!found)
		return -1;

	return 0;
}

static int
dsched_dev_list_policies(struct dsched_ioctl *data)
{
	struct dsched_policy *pol = NULL;
	uint32_t i;

	/* Walk to the policy at index num_elem (0-based), if any */
	for (i = 0; (i <= data->num_elem) && (pol = dsched_policy_enumerate(pol)); i++)
		;

	if (pol == NULL)
		return -1;

	strncpy(data->pol_name, pol->name, sizeof(data->pol_name));
	return 0;
}

static int
dsched_dev_handle_switch(char *disk, char *policy)
{
	struct disk *dp;
	struct dsched_policy *pol;

	dp = dsched_find_disk(disk);
	pol = dsched_find_policy(policy);

	if ((dp == NULL) || (pol == NULL))
		return -1;

	return (dsched_switch(dp, pol));
}

static int
dsched_dev_open(struct dev_open_args *ap)
{
	/*
	 * Only allow read-write access.
	 */
	if (((ap->a_oflags & FWRITE) == 0) || ((ap->a_oflags & FREAD) == 0))
		return(EPERM);

	/*
	 * We don't allow nonblocking access.
	 */
	if ((ap->a_oflags & O_NONBLOCK) != 0) {
		kprintf("dsched_dev: can't do nonblocking access\n");
		return(ENODEV);
	}

	return 0;
}

static int
dsched_dev_close(struct dev_close_args *ap)
{
	return 0;
}

static int
dsched_dev_ioctl(struct dev_ioctl_args *ap)
{
	int error;
	struct dsched_ioctl *data;

	error = 0;
	data = (struct dsched_ioctl *)ap->a_data;

	switch (ap->a_cmd) {
	case DSCHED_SET_DEVICE_POLICY:
		if (dsched_dev_handle_switch(data->dev_name, data->pol_name))
			error = ENOENT; /* No such file or directory */
		break;

	case DSCHED_LIST_DISK:
		if (dsched_dev_list_disk(data) != 0) {
			error = EINVAL; /* Invalid argument */
		}
		break;

	case DSCHED_LIST_DISKS:
		if (dsched_dev_list_disks(data) != 0) {
			error = EINVAL; /* Invalid argument */
		}
		break;

	case DSCHED_LIST_POLICIES:
		if (dsched_dev_list_policies(data) != 0) {
			error = EINVAL; /* Invalid argument */
		}
		break;

	default:
		error = ENOTTY; /* Inappropriate ioctl for device */
		break;
	}

	return(error);
}
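
/*
 * Userland is expected to drive /dev/dsched roughly as follows; this is
 * an illustrative sketch with error handling omitted, using the ioctl
 * commands handled above:
 *
 *	int fd = open("/dev/dsched", O_RDWR);
 *	struct dsched_ioctl data;
 *
 *	bzero(&data, sizeof(data));
 *	while (ioctl(fd, DSCHED_LIST_POLICIES, &data) == 0) {
 *		printf("%s\n", data.pol_name);
 *		data.num_elem++;
 *	}
 */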

/*
 * SYSINIT stuff
 */

static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_default_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

static void
dsched_dev_init(void)
{
	dsched_dev = make_dev(&dsched_dev_ops,
	    0,
	    UID_ROOT,
	    GID_WHEEL,
	    0600,
	    "dsched");
}

static void
dsched_dev_uninit(void)
{
	destroy_dev(dsched_dev);
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
SYSINIT(subr_dsched_dev_register, SI_SUB_DRIVERS, SI_ORDER_ANY, dsched_dev_init, NULL);
SYSUNINIT(subr_dsched_dev_register, SI_SUB_DRIVERS, SI_ORDER_ANY, dsched_dev_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}

SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
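
/*
 * Illustrative sysctl usage with only the built-in policy registered:
 *
 *	$ sysctl dsched.policies
 *	dsched.policies: noop
 */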
1217