/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		default_prepare;
static dsched_teardown_t	default_teardown;
static dsched_cancel_t		default_cancel;
static dsched_queue_t		default_queue;

static d_open_t      dsched_dev_open;
static d_close_t     dsched_dev_close;
static d_ioctl_t     dsched_dev_ioctl;

static int dsched_dev_list_disks(struct dsched_ioctl *data);
static int dsched_dev_list_disk(struct dsched_ioctl *data);
static int dsched_dev_list_policies(struct dsched_ioctl *data);
static int dsched_dev_handle_switch(char *disk, char *policy);

static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);

static int	dsched_inited = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;
static cdev_t	dsched_dev;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_default_policy = {
	.name = "noop",

	.prepare = default_prepare,
	.teardown = default_teardown,
	.cancel_all = default_cancel,
	.bio_queue = default_queue
};

static struct dev_ops dsched_dev_ops = {
	{ "dsched", 0, 0 },
	.d_open = dsched_dev_open,
	.d_close = dsched_dev_close,
	.d_ioctl = dsched_dev_ioctl
};

/*
 * dsched_debug() is a debug output routine built on kvprintf(); its
 * verbosity is controlled by the dsched.debug sysctl and tunable.
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}
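
/*
 * Illustrative call (names hypothetical): a policy might trace with
 *
 *	dsched_debug(LOG_INFO, "queued bio %p on %s\n", bio, diskname);
 *
 * which is printed only when dsched.debug is at least LOG_INFO.
 */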

/*
 * Called on disk_create(). Tries to read the policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
	    head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    head_name);
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		dsched_debug(0, "No policy for %s%d specified, "
		    "or policy not found\n", head_name, unit);
		dsched_set_policy(dp, &dsched_default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}
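
/*
 * Illustrative loader.conf(5) entries matching the tunables probed
 * above (disk and policy names are hypothetical):
 *
 *	dsched.policy.da0="fq"		# one specific unit
 *	dsched.policy.da="fq"		# a whole driver head
 *	dsched.policy.default="fq"	# fallback for all disks
 */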

/*
 * Called from disk_setdiskinfo() (or rather _setdiskinfo()). Checks
 * whether any policy is associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}
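
/*
 * The serial-number form lets a policy follow a device across unit
 * renumbering; an illustrative loader.conf(5) entry (the serial number
 * is hypothetical):
 *
 *	dsched.policy.S1ATNSAF123456="fq"
 */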

/*
 * Called on disk_destroy(). Shuts down the scheduler core and cancels
 * all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_default_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}

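/*
 * Main entry point for bios coming down from the disk layer. Looks up
 * the issuing thread's per-disk I/O context (tdio) and hands the bio to
 * the active policy's bio_queue callback; bios without a thread context,
 * and bios the policy refuses, are dispatched raw instead.
 */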
void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io		*tdio;
	struct dsched_disk_ctx		*diskctx;

	int found = 0, error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* No thread context; bypass the scheduler and dispatch raw */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);
	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}

/*
 * Called from the module_init or module_attach of each policy;
 * registers the policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}
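
/*
 * A minimal registration sketch (hypothetical "example" policy; the
 * field names follow dsched_default_policy above):
 *
 *	static struct dsched_policy example_policy = {
 *		.name = "example",
 *		.prepare = example_prepare,
 *		.teardown = example_teardown,
 *		.cancel_all = example_cancel,
 *		.bio_queue = example_queue
 *	};
 *
 *	error = dsched_register(&example_policy);
 */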

/*
 * Called from the module_detach of each policy; unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}

/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_default_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}

/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if it is locked already; if not, acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}

struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if it is locked already; if not, acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

struct disk *
dsched_find_disk(char *search)
{
	struct disk *dp_found = NULL;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(dp))) {
		if (!strcmp(dp->d_cdev->si_name, search)) {
			dp_found = dp;
			break;
		}
	}

	return dp_found;
}

struct disk *
dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(dp))) {
		if (dp->d_sched_policy == policy)
			return dp;
	}

	return NULL;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, this stuff shouldn't be needed... but just in case, we
	 * leave it in to avoid panics.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
	relpbuf(nbp, NULL);
}

void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}
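
/*
 * A sketch of a biodone_t completion callback to pair with
 * dsched_strategy_async() (names are hypothetical; pop_bio() undoes
 * the push_bio() performed above):
 *
 *	static void
 *	example_completed(struct bio *bp)
 *	{
 *		struct bio *obio = pop_bio(bp);
 *
 *		... consume the priv pointer stored via
 *		dsched_set_bio_priv(), then terminate the chain ...
 *		biodone(obio);
 *	}
 */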

void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

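/*
 * Note on the unref functions below: atomic_fetchadd_int() returns the
 * pre-decrement value, so tear-down runs when that value is 1 (the
 * count has just hit zero). During tear-down the count is additionally
 * biased by -0x400 so that any late reference trips the KKASSERTs.
 */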
void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&diskctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
		print_backtrace(4);
#endif
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		TAILQ_FOREACH_MUTABLE(tdio, &diskctx->tdio_list, dlink, tdio2) {
			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
			dsched_thread_io_unref(tdio);
		}
		lockmgr(&diskctx->lock, LK_RELEASE);
		if (diskctx->dp->d_sched_policy->destroy_diskctx)
			diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
		objcache_put(dsched_diskctx_cache, diskctx);
		atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
	}
}

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_disk_ctx	*diskctx;
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdio->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdio (%p) destruction started, trace:\n", tdio);
		print_backtrace(8);
#endif
		diskctx = tdio->diskctx;
		KKASSERT(diskctx != NULL);
		KKASSERT(tdio->qlength == 0);

		if (tdio->flags & DSCHED_LINKED_DISK_CTX) {
			lockmgr(&diskctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;

			lockmgr(&diskctx->lock, LK_RELEASE);
		}

		if (tdio->flags & DSCHED_LINKED_THREAD_CTX) {
			tdctx = tdio->tdctx;
			KKASSERT(tdctx != NULL);

			lockmgr(&tdctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;

			lockmgr(&tdctx->lock, LK_RELEASE);
		}
		if (tdio->diskctx->dp->d_sched_policy->destroy_tdio)
			tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		objcache_put(dsched_tdio_cache, tdio);
		atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
#if 0
		dsched_disk_ctx_unref(diskctx);
#endif
	}
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
		print_backtrace(8);
#endif
		DSCHED_GLOBAL_THREAD_CTX_LOCK();

		TAILQ_FOREACH_MUTABLE(tdio, &tdctx->tdio_list, link, tdio2) {
			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
			dsched_thread_io_unref(tdio);
		}
		TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

		DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

		objcache_put(dsched_tdctx_cache, tdctx);
		atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
	}
}

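/*
 * Allocates a per-thread, per-disk I/O context (tdio), runs the policy's
 * new_tdio callback, and links the tdio into the disk's tdio_list and,
 * when a thread context is given, into that tdctx's list as well.
 */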
struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_io	*tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	/* XXX: maybe we do need another ref for the disk list for tdio */
	dsched_thread_io_ref(tdio);

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	tdio->flags |= DSCHED_LINKED_DISK_CTX;

	if (tdctx) {
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;

		/* Put the tdio in the tdctx list */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
		tdio->flags |= DSCHED_LINKED_THREAD_CTX;
	}

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
	return tdio;
}

struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}

struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io	*tdio;
	struct disk	*dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	/* XXX */
	while ((dp = disk_enumerate(dp))) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}

void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	struct dsched_thread_io *tdio;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
	}
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}

void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching all
	 *	threads. mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}

void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}

void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

/* DEFAULT NOOP POLICY */

static int
default_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
default_teardown(struct dsched_disk_ctx *diskctx)
{
}

static void
default_cancel(struct dsched_disk_ctx *diskctx)
{
}

static int
default_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, default_completed, NULL);
#endif
	return 0;
}
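
/*
 * The noop policy above doubles as a template: prepare, teardown,
 * cancel_all and bio_queue are the callbacks every policy provides, and
 * its bio_queue simply passes each bio straight through to the raw
 * device.
 */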

/*
 * dsched device stuff
 */

static int
dsched_dev_list_disks(struct dsched_ioctl *data)
{
	struct disk *dp = NULL;
	uint32_t i;

	for (i = 0; (i <= data->num_elem) && (dp = disk_enumerate(dp)); i++)
		;

	if (dp == NULL)
		return -1;

	strncpy(data->dev_name, dp->d_cdev->si_name, sizeof(data->dev_name));

	if (dp->d_sched_policy) {
		strncpy(data->pol_name, dp->d_sched_policy->name,
		    sizeof(data->pol_name));
	} else {
		strncpy(data->pol_name, "N/A (error)", 12);
	}

	return 0;
}

static int
dsched_dev_list_disk(struct dsched_ioctl *data)
{
	struct disk *dp = NULL;
	int found = 0;

	while ((dp = disk_enumerate(dp))) {
		if (!strncmp(dp->d_cdev->si_name, data->dev_name,
		    sizeof(data->dev_name))) {
			KKASSERT(dp->d_sched_policy != NULL);

			found = 1;
			strncpy(data->pol_name, dp->d_sched_policy->name,
			    sizeof(data->pol_name));
			break;
		}
	}
	if (!found)
		return -1;

	return 0;
}

static int
dsched_dev_list_policies(struct dsched_ioctl *data)
{
	struct dsched_policy *pol = NULL;
	uint32_t i;

	for (i = 0; (i <= data->num_elem) && (pol = dsched_policy_enumerate(pol)); i++)
		;

	if (pol == NULL)
		return -1;

	strncpy(data->pol_name, pol->name, sizeof(data->pol_name));
	return 0;
}

static int
dsched_dev_handle_switch(char *disk, char *policy)
{
	struct disk *dp;
	struct dsched_policy *pol;

	dp = dsched_find_disk(disk);
	pol = dsched_find_policy(policy);

	if ((dp == NULL) || (pol == NULL))
		return -1;

	return (dsched_switch(dp, pol));
}
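
/*
 * Illustrative userland use of the switch ioctl (hypothetical disk and
 * policy names, error handling omitted; the /dev/dsched node is created
 * in dsched_dev_init() below):
 *
 *	struct dsched_ioctl d;
 *	int fd = open("/dev/dsched", O_RDWR);
 *
 *	bzero(&d, sizeof(d));
 *	strncpy(d.dev_name, "da0", sizeof(d.dev_name));
 *	strncpy(d.pol_name, "fq", sizeof(d.pol_name));
 *	ioctl(fd, DSCHED_SET_DEVICE_POLICY, &d);
 */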

static int
dsched_dev_open(struct dev_open_args *ap)
{
	/*
	 * Only allow read-write access.
	 */
	if (((ap->a_oflags & FWRITE) == 0) || ((ap->a_oflags & FREAD) == 0))
		return(EPERM);

	/*
	 * We don't allow nonblocking access.
	 */
	if ((ap->a_oflags & O_NONBLOCK) != 0) {
		kprintf("dsched_dev: can't do nonblocking access\n");
		return(ENODEV);
	}

	return 0;
}

static int
dsched_dev_close(struct dev_close_args *ap)
{
	return 0;
}

static int
dsched_dev_ioctl(struct dev_ioctl_args *ap)
{
	int error;
	struct dsched_ioctl *data;

	error = 0;
	data = (struct dsched_ioctl *)ap->a_data;

	switch (ap->a_cmd) {
	case DSCHED_SET_DEVICE_POLICY:
		if (dsched_dev_handle_switch(data->dev_name, data->pol_name))
			error = ENOENT; /* No such file or directory */
		break;

	case DSCHED_LIST_DISK:
		if (dsched_dev_list_disk(data) != 0) {
			error = EINVAL; /* Invalid argument */
		}
		break;

	case DSCHED_LIST_DISKS:
		if (dsched_dev_list_disks(data) != 0) {
			error = EINVAL; /* Invalid argument */
		}
		break;

	case DSCHED_LIST_POLICIES:
		if (dsched_dev_list_policies(data) != 0) {
			error = EINVAL; /* Invalid argument */
		}
		break;

	default:
		error = ENOTTY; /* Inappropriate ioctl for device */
		break;
	}

	return(error);
}

/*
 * SYSINIT stuff
 */

static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_default_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

static void
dsched_dev_init(void)
{
	dsched_dev = make_dev(&dsched_dev_ops,
	    0,
	    UID_ROOT,
	    GID_WHEEL,
	    0600,
	    "dsched");
}

static void
dsched_dev_uninit(void)
{
	destroy_dev(dsched_dev);
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
SYSINIT(subr_dsched_dev_register, SI_SUB_DRIVERS, SI_ORDER_ANY, dsched_dev_init, NULL);
SYSUNINIT(subr_dsched_dev_register, SI_SUB_DRIVERS, SI_ORDER_ANY, dsched_dev_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}

static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
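
/*
 * Illustrative sysctl(8) usage (disk and policy names hypothetical;
 * the per-disk nodes are attached by dsched_sysctl_add_disk() below):
 *
 *	sysctl dsched.policies		# list registered policies
 *	sysctl dsched.policy.da0	# show a disk's active policy
 *	sysctl dsched.policy.da0=fq	# switch it via sysctl_dsched_policy()
 */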

static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}
1285