/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);

static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;

/*
 * dsched_debug() is a SYSCTL- and TUNABLE-controlled debug output function
 * based on kvprintf(); a message is printed only when its level is at or
 * below the current dsched.debug setting.
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}
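
/*
 * Usage sketch (illustrative only; the message text and level are made up):
 * a caller tags each message with a verbosity level, and raising the
 * dsched.debug sysctl (or the matching loader tunable) to at least that
 * level makes it visible:
 *
 *	dsched_debug(4, "policy attach on %s\n", dp->d_cdev->si_name);
 */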

/*
 * Called on disk_create(); tries to read which policy to use from
 * loader.conf.  If none is specified, the default policy is used.
 */
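/*
 * Illustrative loader.conf entries matching the tunable keys built below
 * (the "fq" policy name is only an example and assumes such a module is
 * loaded; slashes in device head names are written as dashes):
 *
 *	dsched.policy.default="fq"
 *	dsched.policy.ad0="noop"
 *	dsched.policy.da="fq"
 *	dsched.policy.mapper-cryptvol="noop"
 */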
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	char *ptr;
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
	    head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    head_name);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set &&
	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set) {
			dsched_debug(0, "No policy for %s%d specified, "
			    "or policy not found\n", head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
	else
		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}

/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo).  This will check
 * if there's any policy associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}

/*
 * Called on disk_destroy(); shuts down the policy core for the disk and
 * cancels all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}


void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io		*tdio;
	struct dsched_disk_ctx		*diskctx;

	int found = 0, error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* We don't handle this case; dispatch the bio directly */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);
	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}


/*
 * Called from each policy's module_init or module_attach; registers the
 * policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}
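
/*
 * Sketch of how a policy module would hook in (illustrative; the "myq"
 * name and its callbacks are hypothetical, but the struct layout mirrors
 * dsched_noop_policy above):
 *
 *	static struct dsched_policy myq_policy = {
 *		.name = "myq",
 *		.prepare = myq_prepare,
 *		.teardown = myq_teardown,
 *		.cancel_all = myq_cancel,
 *		.bio_queue = myq_queue
 *	};
 *
 *	error = dsched_register(&myq_policy);	 (on module load)
 *	error = dsched_unregister(&myq_policy);	 (on module unload)
 */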

/*
 * Called from each policy's module_detach; unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Switches the policy of a disk by first tearing down the old one and
 * then enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Loads a given policy and attaches it to the specified disk.  Also
 * initializes the policy's core for the disk.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if it is locked already; if not, we acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
	    new_policy->name);

	/* If we acquired the lock, release it again */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}

struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if it is locked already; if not, we acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, release it again */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

struct disk *
dsched_find_disk(char *search)
{
	struct disk *dp_found = NULL;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(dp))) {
		if (!strcmp(dp->d_cdev->si_name, search)) {
			dp_found = dp;
			break;
		}
	}

	return dp_found;
}

struct disk *
dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(dp))) {
		if (dp->d_sched_policy == policy)
			return dp;
	}

	return NULL;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, this stuff shouldn't be needed... but just in case, we
	 * leave it in to avoid panics.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
#if 0
	/*
	 * Buffers undergoing device I/O do not need a kvabase/size.
	 */
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
#endif
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
#if 0
	nbp->b_kvabase = NULL;
	nbp->b_kvasize = 0;
#endif
	relpbuf(nbp, NULL);
}

void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}
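
/*
 * Sketch of a matching completion callback (illustrative; "myq_completed"
 * and its bookkeeping are hypothetical).  Since dsched_strategy_async()
 * pushes a new bio onto the stack, the callback pops it again before
 * finishing the original request:
 *
 *	static void
 *	myq_completed(struct bio *bp)
 *	{
 *		struct bio *obio;
 *
 *		... per-policy accounting, e.g. via dsched_get_bio_priv(bp) ...
 *
 *		obio = pop_bio(bp);
 *		biodone(obio);
 *	}
 */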

void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);

	KKASSERT(refcount >= 0);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);

	KKASSERT(refcount >= 0);
}
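
/*
 * Note on the destruction protocol used by the unref functions below:
 * atomic_fetchadd_int() returns the pre-decrement value, so a result of
 * 1 means the count just dropped to zero and the object must be torn
 * down.  The destructor then subtracts a further 0x400, pushing the
 * count far negative; a late unref on an object already in destruction
 * thus stays in the <= -0x400 range accepted by the KKASSERTs, while an
 * ordinary underflow (values -1 .. -0x3ff) still trips them.
 */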
void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&diskctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
		print_backtrace(4);
#endif
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		TAILQ_FOREACH_MUTABLE(tdio, &diskctx->tdio_list, dlink, tdio2) {
			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
			dsched_thread_io_unref(tdio);
		}
		lockmgr(&diskctx->lock, LK_RELEASE);
		if (diskctx->dp->d_sched_policy->destroy_diskctx)
			diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
		objcache_put(dsched_diskctx_cache, diskctx);
		atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
	}
}

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_disk_ctx	*diskctx;
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdio->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdio (%p) destruction started, trace:\n", tdio);
		print_backtrace(8);
#endif
		diskctx = tdio->diskctx;
		KKASSERT(diskctx != NULL);
		KKASSERT(tdio->qlength == 0);

		if (tdio->flags & DSCHED_LINKED_DISK_CTX) {
			lockmgr(&diskctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;

			lockmgr(&diskctx->lock, LK_RELEASE);
		}

		if (tdio->flags & DSCHED_LINKED_THREAD_CTX) {
			tdctx = tdio->tdctx;
			KKASSERT(tdctx != NULL);

			lockmgr(&tdctx->lock, LK_EXCLUSIVE);

			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;

			lockmgr(&tdctx->lock, LK_RELEASE);
		}
		if (tdio->diskctx->dp->d_sched_policy->destroy_tdio)
			tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		objcache_put(dsched_tdio_cache, tdio);
		atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
#if 0
		dsched_disk_ctx_unref(diskctx);
#endif
	}
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io	*tdio, *tdio2;
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, -1);

	KKASSERT(refcount >= 0 || refcount <= -0x400);

	if (refcount == 1) {
		atomic_subtract_int(&tdctx->refcount, 0x400); /* mark as: in destruction */
#if 0
		kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
		print_backtrace(8);
#endif
		DSCHED_GLOBAL_THREAD_CTX_LOCK();

		TAILQ_FOREACH_MUTABLE(tdio, &tdctx->tdio_list, link, tdio2) {
			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
			dsched_thread_io_unref(tdio);
		}
		TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

		DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

		objcache_put(dsched_tdctx_cache, tdctx);
		atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
	}
}


struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_io	*tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	/* XXX: maybe we do need another ref for the disk list for tdio */
	dsched_thread_io_ref(tdio);

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	tdio->flags |= DSCHED_LINKED_DISK_CTX;
	lockmgr(&tdio->diskctx->lock, LK_RELEASE);

	if (tdctx) {
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;

		/* Put the tdio in the tdctx list */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
		tdio->flags |= DSCHED_LINKED_THREAD_CTX;
	}

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
	return tdio;
}


struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}


struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;
	struct dsched_thread_io	*tdio;
	struct disk	*dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	while ((dp = disk_enumerate(dp))) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
	}

	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}

void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	struct dsched_thread_io *tdio;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
	}
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}

void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. we don't want this assert because we aren't catching all
	 *	threads. mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx. if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx	*tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}


void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, NULL);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}


void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx	*tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, NULL);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

struct dsched_thread_io *
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);
	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	return tdio;
}

/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{

}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{

}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}

/*
 * SYSINIT stuff
 */
static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					   NULL, NULL, NULL,
					   objcache_malloc_alloc,
					   objcache_malloc_free,
					   &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}

static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");

static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}
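
/*
 * Runtime usage sketch (illustrative; "ad0" stands in for any attached
 * disk).  The sysctl nodes created above make each disk's policy readable
 * and writable at runtime:
 *
 *	sysctl dsched.policies			(list available policies)
 *	sysctl dsched.policy.ad0		(query the current policy)
 *	sysctl dsched.policy.ad0=noop		(switch the policy)
 *	sysctl dsched.policy.default=noop	(default for new disks)
 */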