/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

TAILQ_HEAD(tdio_list_head, dsched_thread_io);

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);

static struct dsched_thread_io *dsched_thread_io_alloc(
		struct disk *dp, struct dsched_thread_ctx *tdctx,
		struct dsched_policy *pol, int tdctx_locked);

static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;

/*
 * dsched_debug() is a debug output function gated by the SYSCTL- and
 * TUNABLE-controlled dsched_debug_enable level; it prints via kvprintf().
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}

/*
 * Called on disk_create().  Tries to read which policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	char *ptr;
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key),
		  "dsched.policy.%s%d", head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key),
		  "dsched.policy.%s", head_name);

	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set &&
	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
			       sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set && bootverbose) {
			dsched_debug(0,
				     "No policy for %s%d specified, "
				     "or policy not found\n",
				     head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
	else
		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d",
			  head_name, unit);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}

/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
 * there's any policy associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}

/*
 * Called on disk_destroy().  Shuts down the scheduler core and cancels
 * all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}


/*
 * Caller must have dp->diskctx locked
 */
void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io		*tdio;
	struct dsched_disk_ctx		*diskctx;
	int	error;

	if (dp->d_sched_policy == &dsched_noop_policy) {
		dsched_clr_buf_priv(bio->bio_buf);
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	error = 0;
	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* We don't handle this case, let dsched dispatch */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	/*
	 * XXX:
	 * iterate in reverse to make sure we find the most up-to-date
	 * tdio for a given disk. After a switch it may take some time
	 * for everything to clean up.
	 */
	TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			break;
		}
	}
	if (tdio == NULL) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy, 1);
		dsched_thread_io_ref(tdio);
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);

	if (dp->d_sched_policy != &dsched_noop_policy)
		KKASSERT(tdio->debug_policy == dp->d_sched_policy);

	KKASSERT(tdio->debug_inited == 0xF00F1234);

	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}


/*
 * Called from each policy's module_init or module_attach; registers the
 * policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}

/*
 * Called from each policy's module_detach; unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);

	return 0;
}


/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);

	return 0;
}


/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if it is locked already; if not, acquire the dsched lock */
	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}

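/*
 * Looks up a policy by name in the list of registered policies.
 * Returns the policy, or NULL if no policy with that name is registered.
 */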
struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if it is locked already; if not, acquire the dsched lock */
	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

/*
 * Returns ref'd disk
 */
struct disk *
dsched_find_disk(char *search)
{
	struct disk marker;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(&marker, dp)) != NULL) {
		if (strcmp(dp->d_cdev->si_name, search) == 0) {
			disk_enumerate_stop(&marker, NULL);
			/* leave ref on dp */
			break;
		}
	}
	return dp;
}

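/*
 * Returns the next disk (ref'd via disk_enumerate()) whose active
 * policy matches the given policy, or NULL at the end of the
 * enumeration.
 */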
struct disk *
dsched_disk_enumerate(struct disk *marker, struct disk *dp,
		      struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(marker, dp)) != NULL) {
		if (dp->d_sched_policy == policy)
			break;
	}
	return dp;
}

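/*
 * Enumerates the registered policies.  Pass NULL to start at the head
 * of the list, then feed the previous return value back in.
 */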
struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

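/*
 * Terminates a bio with ENXIO without it ever being dispatched to the
 * device.
 */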
void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

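/*
 * Passes a bio straight down to the disk's raw device, bypassing the
 * scheduling policy.
 */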
void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, this stuff shouldn't be needed... but just in case, we
	 * leave it in to avoid panics.
	 */
	KASSERT(dp->d_rawdev != NULL,
	    ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

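/*
 * Issues a bio synchronously: the request is cloned onto a pbuf and
 * dispatched to the raw device, and we biowait() for it before
 * completing the original bio.
 */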
void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
#if 0
	/*
	 * Buffers undergoing device I/O do not need a kvabase/size.
	 */
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
#endif
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
#if 0
	nbp->b_kvabase = NULL;
	nbp->b_kvasize = 0;
#endif
	relpbuf(nbp, NULL);
}

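/*
 * Dispatches a bio asynchronously: pushes a new bio layer with the
 * caller-supplied biodone callback and private data, timestamps it and
 * hands it to the raw device.
 */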
void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done,
		      void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}

/*
 * A special biodone callback function used by policies that implement
 * request polling.
 */
static void
request_polling_biodone(struct bio *bp)
{
	struct dsched_disk_ctx *diskctx = NULL;
	struct disk *dp = NULL;
	struct bio *obio;
	struct dsched_policy *policy;

	dp = dsched_get_bio_dp(bp);
	policy = dp->d_sched_policy;
	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx && policy);
	dsched_disk_ctx_ref(diskctx);

	/*
	 * XXX:
	 * the bio_done function must not block!
	 */
	if (diskctx->dp->d_sched_policy->bio_done)
		diskctx->dp->d_sched_policy->bio_done(bp);

	obio = pop_bio(bp);
	biodone(obio);

	atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);

	/*
	 * Call the polling function.
	 * XXX:
	 * the polling function must not block!
	 */
	if (policy->polling_func)
		policy->polling_func(diskctx);
	else
		dsched_debug(0, "dsched: the policy uses request polling "
		    "without a polling function!\n");
	dsched_disk_ctx_unref(diskctx);
}

/*
 * A special dsched strategy used by policies that implement request
 * polling (a polling function).
 *
 * The strategy is just like dsched_strategy_async(), but the biodone
 * callback is set to a preset one.
 *
 * If the policy needs its own biodone callback, it should register it
 * in the policy structure (bio_done field).
 *
 * The current_tag_queue_depth is maintained by this function and the
 * request_polling_biodone() function.
 */
void
dsched_strategy_request_polling(struct disk *dp, struct bio *bio,
				struct dsched_disk_ctx *diskctx)
{
	atomic_add_int(&diskctx->current_tag_queue_depth, 1);
	dsched_strategy_async(dp, bio, request_polling_biodone,
			      dsched_get_bio_priv(bio));
}

/*
 * Ref and deref various structures.  The 1->0 transition of the reference
 * count actually transitions 1->0x80000000 and causes the object to be
 * destroyed.  It is possible for transitory references to occur on the
 * object while it is being destroyed.  We use bit 31 to indicate that
 * destruction is in progress and to prevent nested destructions.
 */
void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
}

void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for diskctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = diskctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
			dsched_disk_ctx_destroy(diskctx);
			break;
		}
	}
}

static
void
dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io	*tdio;
	int refs;
	int nrefs;

#if 0
	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
	print_backtrace(4);
#endif
	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		lockmgr(&diskctx->lock, LK_RELEASE);
		lockmgr(&tdio->lock, LK_EXCLUSIVE);
		dsched_thread_io_unref_destroy(tdio);
		lockmgr(&tdio->lock, LK_RELEASE);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	}
	lockmgr(&diskctx->lock, LK_RELEASE);

	/*
	 * Expect diskctx->refcount to be 0x80000000.  If it isn't someone
	 * else still has a temporary ref on the diskctx and we have to
	 * transition it back to an undestroyed-state (albeit without any
	 * associations), so the other user destroys it properly when the
	 * ref is released.
	 */
	while ((refs = diskctx->refcount) != 0x80000000) {
		kprintf("dsched_disk_ctx: destroy race diskctx=%p\n", diskctx);
		cpu_ccfence();
		KKASSERT(refs & 0x80000000);
		nrefs = refs & 0x7FFFFFFF;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
			return;
	}

	/*
	 * Really for sure now.
	 */
	if (diskctx->dp->d_sched_policy->destroy_diskctx)
		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
	objcache_put(dsched_diskctx_cache, diskctx);
	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
}

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdio and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}

/*
 * Unref and destroy the tdio even if additional refs are present.
 */
static
void
dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * If not already transitioned to destroy-in-progress we transition
	 * to destroy-in-progress, cleanup our ref, and destroy the tdio.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs & 0x80000000) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs |= 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}

static void
dsched_async_thread_io_destroy(struct bio *bio)
{
	struct bio *obio;
	void *ident = dsched_get_bio_priv(bio);

	obio = pop_bio(bio);
	biodone(obio);
	wakeup(ident);
}


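/*
 * Waits until the tdio's bio queue is empty.  Each queued bio is
 * rewrapped with a biodone callback that wakes us up when it is
 * eventually completed.
 */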
static void
dsched_thread_io_drain(struct dsched_thread_io *tdio)
{
	struct bio *bio;
	struct bio *nbio;

	while (tdio->qlength != 0) {
		bio = TAILQ_LAST(&tdio->queue, tdio_queue);
		TAILQ_REMOVE(&tdio->queue, bio, link);

		nbio = push_bio(bio);
		nbio->bio_done = &dsched_async_thread_io_destroy;
		nbio->bio_offset = bio->bio_offset;

		dsched_set_bio_dp(nbio, tdio->dp);
		dsched_set_bio_priv(nbio, (void *)tdio);
		TAILQ_INSERT_TAIL(&tdio->queue, nbio, link);

		lksleep((void *)tdio, &tdio->lock, 0, "tdiow", 0);
	}
}

static void
dsched_thread_io_destroy(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx	*diskctx;
	int refs;
	int nrefs;

#if 0
	kprintf("tdio (%p) destruction started, trace:\n", tdio);
	print_backtrace(8);
#endif
	KKASSERT(tdio->qlength == 0);

	while ((diskctx = tdio->diskctx) != NULL) {
		dsched_disk_ctx_ref(diskctx);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		if (diskctx != tdio->diskctx) {
			lockmgr(&diskctx->lock, LK_RELEASE);
			dsched_disk_ctx_unref(diskctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		if (diskctx->dp->d_sched_policy->destroy_tdio)
			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		dsched_thread_io_unref(tdio);
		lockmgr(&diskctx->lock, LK_RELEASE);
		dsched_disk_ctx_unref(diskctx);
	}
	while ((tdctx = tdio->tdctx) != NULL) {
		dsched_thread_ctx_ref(tdctx);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
		if (tdctx != tdio->tdctx) {
			lockmgr(&tdctx->lock, LK_RELEASE);
			dsched_thread_ctx_unref(tdctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		dsched_thread_io_unref(tdio);
		lockmgr(&tdctx->lock, LK_RELEASE);
		dsched_thread_ctx_unref(tdctx);
	}

	/*
	 * Expect tdio->refcount to be 0x80000000.  If it isn't someone else
	 * still has a temporary ref on the tdio and we have to transition
	 * it back to an undestroyed-state (albeit without any associations)
	 * so the other user destroys it properly when the ref is released.
	 */
	while ((refs = tdio->refcount) != 0x80000000) {
		kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
		cpu_ccfence();
		KKASSERT(refs & 0x80000000);
		nrefs = refs & 0x7FFFFFFF;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
			return;
	}

	/*
	 * Really for sure now.
	 */
	objcache_put(dsched_tdio_cache, tdio);
	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
			dsched_thread_ctx_destroy(tdctx);
			break;
		}
	}
}

static void
dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io	*tdio;

	lockmgr(&tdctx->lock, LK_EXCLUSIVE);

	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		lockmgr(&tdio->lock, LK_EXCLUSIVE);
		dsched_thread_io_drain(tdio);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		lockmgr(&tdio->lock, LK_RELEASE);
		lockmgr(&tdctx->lock, LK_RELEASE);	/* avoid deadlock */
		dsched_thread_io_unref_destroy(tdio);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
	}
	KKASSERT(tdctx->refcount == 0x80000000);

	lockmgr(&tdctx->lock, LK_RELEASE);

	objcache_put(dsched_tdctx_cache, tdctx);
	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
}

/*
 * Ensures that a tdio is assigned to tdctx and disk.
 */
static
struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
		       struct dsched_policy *pol, int tdctx_locked)
{
	struct dsched_thread_io	*tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_INTWAIT);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	dsched_thread_io_ref(tdio);	/* prevent ripout */
	dsched_thread_io_ref(tdio);	/* for diskctx ref */

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	DSCHED_DISK_CTX_LOCK(tdio->diskctx);
	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
	DSCHED_DISK_CTX_UNLOCK(tdio->diskctx);

	if (tdctx) {
		/*
		 * Put the tdio in the tdctx list.  Inherit the temporary
		 * ref (one ref for each list).
		 */
		if (tdctx_locked == 0)
			DSCHED_THREAD_CTX_LOCK(tdctx);
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		if (tdctx_locked == 0)
			DSCHED_THREAD_CTX_UNLOCK(tdctx);
	} else {
		dsched_thread_io_unref(tdio);
	}

	tdio->debug_policy = pol;
	tdio->debug_inited = 0xF00F1234;

	atomic_add_int(&dsched_stats.tdio_allocations, 1);

	return(tdio);
}

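/*
 * Allocates and initializes a ref'd disk context for the given disk and
 * policy, calling the policy's new_diskctx callback if it has one.
 */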
struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);
	/*
	 * XXX: magic number 32: most devices have a tag queue
	 * of depth 32.
	 * Better to retrieve a more precise value from the driver.
	 */
	diskctx->max_tag_queue_depth = 32;
	diskctx->current_tag_queue_depth = 0;

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}

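/*
 * Allocates and initializes a ref'd thread context; p may be NULL for
 * kernel threads.
 */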
struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */

	return tdctx;
}

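/*
 * Allocates the disk context for a newly attached policy and installs
 * it as the disk's private data.
 */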
void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);
}

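/*
 * Detaches the current policy's disk context from the disk and drops
 * its two refs (see policy_new()).
 */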
void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

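/*
 * Called when a new buf is instantiated; tags it with the issuing
 * thread's (or process's) thread context, taking a ref on the tdctx.
 */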
void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching all
	 *	threads. mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

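/*
 * Called when a buf is done; releases the thread context ref taken in
 * dsched_new_buf().
 */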
void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

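/*
 * Called on process creation; allocates a thread context and attaches
 * it to the process.
 */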
void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}


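/*
 * Called on thread creation; allocates a thread context (without a
 * proc) and attaches it to the thread.
 */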
void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

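/*
 * Called on process exit; marks the thread context dead and drops the
 * refs taken at allocation time.
 */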
void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, NULL);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}


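/*
 * Called on thread exit; marks the thread context dead and drops the
 * refs taken at allocation time.
 */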
void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, NULL);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

/*
 * Allocates a ref'd tdio for the current thread on the given disk and
 * policy.  The tdio holds additional refs for the diskctx and tdctx
 * lists it is placed on.
 */
void
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
			      struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);
	dsched_thread_io_alloc(diskctx->dp, tdctx, pol, 0);
}

/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{

}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{

}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
	   struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}

/*
 * SYSINIT stuff
 */
static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST,
    dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY,
    dsched_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats,
	    sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}

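/*
 * Per-disk sysctl handler; reports the disk's active policy and, on
 * write, switches the disk to the named policy.
 */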
static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

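/*
 * Sysctl handler for dsched.policy.default; reports the default policy
 * and, on write, makes the named policy the default for newly created
 * disks.
 */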
static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

1453 
1454 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1455     "Disk Scheduler Framework (dsched) magic");
1456 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1457     "List of disks and their policies");
1458 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1459     0, "Enable dsched debugging");
1460 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1461     0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1462     "dsched statistics");
1463 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1464     NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1465 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1466     NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
1467 
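/*
 * Registers the per-disk "policy" sysctl node under dsched.policy.
 */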
static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}