xref: /netbsd-src/sys/kern/subr_disk.c (revision 21e37cc72a480a47828990a439cde7ac9ffaf0c6)
1 /*	$NetBSD: subr_disk.c,v 1.60 2004/03/09 12:23:07 yamt Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the NetBSD
22  *	Foundation, Inc. and its contributors.
23  * 4. Neither the name of The NetBSD Foundation nor the names of its
24  *    contributors may be used to endorse or promote products derived
25  *    from this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37  * POSSIBILITY OF SUCH DAMAGE.
38  */
39 
40 /*
41  * Copyright (c) 1982, 1986, 1988, 1993
42  *	The Regents of the University of California.  All rights reserved.
43  * (c) UNIX System Laboratories, Inc.
44  * All or some portions of this file are derived from material licensed
45  * to the University of California by American Telephone and Telegraph
46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47  * the permission of UNIX System Laboratories, Inc.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. Neither the name of the University nor the names of its contributors
58  *    may be used to endorse or promote products derived from this software
59  *    without specific prior written permission.
60  *
61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71  * SUCH DAMAGE.
72  *
73  *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
74  */
75 
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.60 2004/03/09 12:23:07 yamt Exp $");
78 
79 #include "opt_compat_netbsd.h"
80 #include "opt_bufq.h"
81 
82 #include <sys/param.h>
83 #include <sys/kernel.h>
84 #include <sys/malloc.h>
85 #include <sys/buf.h>
86 #include <sys/syslog.h>
87 #include <sys/disklabel.h>
88 #include <sys/disk.h>
89 #include <sys/sysctl.h>
90 #include <lib/libkern/libkern.h>
91 
92 /*
93  * A global list of all disks attached to the system.  May grow or
94  * shrink over time.
95  */
struct	disklist_head disklist;	/* TAILQ_HEAD */
int	disk_count;		/* number of drives in global disklist */
/* Protects insertions into and removals from disklist. */
struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER;

/*
 * Compile-time selection of the default buffer queue strategy
 * handed to drivers (see bufq_alloc() below).
 */
#ifdef NEW_BUFQ_STRATEGY
int bufq_disk_default_strat = BUFQ_READ_PRIO;
#else /* NEW_BUFQ_STRATEGY */
int bufq_disk_default_strat = BUFQ_DISKSORT;
#endif /* NEW_BUFQ_STRATEGY */
105 
106 /*
107  * Compute checksum for disk label.
108  */
109 u_int
110 dkcksum(struct disklabel *lp)
111 {
112 	u_short *start, *end;
113 	u_short sum = 0;
114 
115 	start = (u_short *)lp;
116 	end = (u_short *)&lp->d_partitions[lp->d_npartitions];
117 	while (start < end)
118 		sum ^= *start++;
119 	return (sum);
120 }
121 
122 /*
123  * Disk error is the preface to plaintive error messages
124  * about failing disk transfers.  It prints messages of the form
125 
126 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
127 
128  * if the offset of the error in the transfer and a disk label
129  * are both available.  blkdone should be -1 if the position of the error
130  * is unknown; the disklabel pointer may be null from drivers that have not
131  * been converted to use them.  The message is printed with printf
132  * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
133  * The message should be completed (with at least a newline) with printf
134  * or addlog, respectively.  There is no trailing space.
135  */
#ifndef PRIdaddr
#define PRIdaddr PRId64
#endif
/*
 * Print the preamble of a disk-transfer error message (see the block
 * comment above for the output format).  The caller must finish the
 * message (at least a newline) with printf or addlog as appropriate.
 */
void
diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
    int blkdone, const struct disklabel *lp)
{
	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
	void (*pr)(const char *, ...);
	char partname = 'a' + part;
	daddr_t sn;

	if (/*CONSTCOND*/0)
		/* Compiler will error if the format is wrong... */
		printf("%" PRIdaddr, bp->b_blkno);

	if (pri != LOG_PRINTF) {
		/*
		 * Open a log record at the requested priority; the
		 * message text itself is then appended via addlog().
		 */
		static const char fmt[] = "";
		log(pri, fmt);
		pr = addlog;
	} else
		pr = printf;
	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
	    bp->b_flags & B_READ ? "read" : "writ");
	sn = bp->b_blkno;
	if (bp->b_bcount <= DEV_BSIZE)
		(*pr)("%" PRIdaddr, sn);
	else {
		if (blkdone >= 0) {
			/* Point at the failing block within the transfer. */
			sn += blkdone;
			(*pr)("%" PRIdaddr " of ", sn);
		}
		(*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
	}
	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
		/*
		 * Translate to an absolute sector number and break it
		 * down into cylinder/track/sector via the label geometry.
		 */
		sn += lp->d_partitions[part].p_offset;
		(*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
		    dname, unit, sn, sn / lp->d_secpercyl);
		sn %= lp->d_secpercyl;
		(*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
		    sn / lp->d_nsectors, sn % lp->d_nsectors);
	}
}
180 
181 /*
182  * Initialize the disklist.  Called by main() before autoconfiguration.
183  */
184 void
185 disk_init(void)
186 {
187 
188 	TAILQ_INIT(&disklist);
189 	disk_count = 0;
190 }
191 
192 /*
193  * Searches the disklist for the disk corresponding to the
194  * name provided.
195  */
196 struct disk *
197 disk_find(char *name)
198 {
199 	struct disk *diskp;
200 
201 	if ((name == NULL) || (disk_count <= 0))
202 		return (NULL);
203 
204 	simple_lock(&disklist_slock);
205 	for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
206 	    diskp = TAILQ_NEXT(diskp, dk_link))
207 		if (strcmp(diskp->dk_name, name) == 0) {
208 			simple_unlock(&disklist_slock);
209 			return (diskp);
210 		}
211 	simple_unlock(&disklist_slock);
212 
213 	return (NULL);
214 }
215 
216 /*
217  * Attach a disk.
218  */
219 void
220 disk_attach(struct disk *diskp)
221 {
222 	int s;
223 
224 	/*
225 	 * Allocate and initialize the disklabel structures.  Note that
226 	 * it's not safe to sleep here, since we're probably going to be
227 	 * called during autoconfiguration.
228 	 */
229 	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
230 	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
231 	    M_NOWAIT);
232 	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
233 		panic("disk_attach: can't allocate storage for disklabel");
234 
235 	memset(diskp->dk_label, 0, sizeof(struct disklabel));
236 	memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel));
237 
238 	/*
239 	 * Set the attached timestamp.
240 	 */
241 	s = splclock();
242 	diskp->dk_attachtime = mono_time;
243 	splx(s);
244 
245 	/*
246 	 * Link into the disklist.
247 	 */
248 	simple_lock(&disklist_slock);
249 	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
250 	simple_unlock(&disklist_slock);
251 	++disk_count;
252 }
253 
254 /*
255  * Detach a disk.
256  */
257 void
258 disk_detach(struct disk *diskp)
259 {
260 
261 	/*
262 	 * Remove from the disklist.
263 	 */
264 	if (--disk_count < 0)
265 		panic("disk_detach: disk_count < 0");
266 	simple_lock(&disklist_slock);
267 	TAILQ_REMOVE(&disklist, diskp, dk_link);
268 	simple_unlock(&disklist_slock);
269 
270 	/*
271 	 * Free the space used by the disklabel structures.
272 	 */
273 	free(diskp->dk_label, M_DEVBUF);
274 	free(diskp->dk_cpulabel, M_DEVBUF);
275 }
276 
277 /*
278  * Increment a disk's busy counter.  If the counter is going from
279  * 0 to 1, set the timestamp.
280  */
281 void
282 disk_busy(struct disk *diskp)
283 {
284 	int s;
285 
286 	/*
287 	 * XXX We'd like to use something as accurate as microtime(),
288 	 * but that doesn't depend on the system TOD clock.
289 	 */
290 	if (diskp->dk_busy++ == 0) {
291 		s = splclock();
292 		diskp->dk_timestamp = mono_time;
293 		splx(s);
294 	}
295 }
296 
297 /*
298  * Decrement a disk's busy counter, increment the byte count, total busy
299  * time, and reset the timestamp.
300  */
301 void
302 disk_unbusy(struct disk *diskp, long bcount, int read)
303 {
304 	int s;
305 	struct timeval dv_time, diff_time;
306 
307 	if (diskp->dk_busy-- == 0) {
308 		printf("%s: dk_busy < 0\n", diskp->dk_name);
309 		panic("disk_unbusy");
310 	}
311 
312 	s = splclock();
313 	dv_time = mono_time;
314 	splx(s);
315 
316 	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
317 	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);
318 
319 	diskp->dk_timestamp = dv_time;
320 	if (bcount > 0) {
321 		if (read) {
322 			diskp->dk_rbytes += bcount;
323 			diskp->dk_rxfer++;
324 		} else {
325 			diskp->dk_wbytes += bcount;
326 			diskp->dk_wxfer++;
327 		}
328 	}
329 }
330 
331 /*
332  * Reset the metrics counters on the given disk.  Note that we cannot
333  * reset the busy counter, as it may case a panic in disk_unbusy().
334  * We also must avoid playing with the timestamp information, as it
335  * may skew any pending transfer results.
336  */
337 void
338 disk_resetstat(struct disk *diskp)
339 {
340 	int s = splbio(), t;
341 
342 	diskp->dk_rxfer = 0;
343 	diskp->dk_rbytes = 0;
344 	diskp->dk_wxfer = 0;
345 	diskp->dk_wbytes = 0;
346 
347 	t = splclock();
348 	diskp->dk_attachtime = mono_time;
349 	splx(t);
350 
351 	timerclear(&diskp->dk_time);
352 
353 	splx(s);
354 }
355 
/*
 * sysctl helper for hw.disknames: copy out a single space-separated,
 * NUL-terminated string listing the names of all attached disks.
 * Read-only; writing returns EPERM.
 */
int
sysctl_hw_disknames(SYSCTLFN_ARGS)
{
	char buf[DK_DISKNAMELEN + 1];
	char *where = oldp;
	struct disk *diskp;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	simple_lock(&disklist_slock);
	for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
	    diskp = TAILQ_NEXT(diskp, dk_link)) {
		/* Sizing pass: oldp == NULL just reports the space needed. */
		if (where == NULL)
			needed += strlen(diskp->dk_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, diskp->dk_name, sizeof(buf));
				first = 0;
			} else {
				/* Subsequent names are space-separated. */
				buf[0] = ' ';
				strncpy(buf + 1, diskp->dk_name,
				    sizeof(buf) - 1);
			}
			/* strncpy may not terminate; force it. */
			buf[DK_DISKNAMELEN] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			/*
			 * Advance by slen only, so the next iteration's
			 * copyout overwrites this NUL; the last NUL written
			 * terminates the whole list.
			 */
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	simple_unlock(&disklist_slock);
	*oldlenp = needed;
	return (error);
}
407 
/*
 * sysctl helper for hw.diskstats: copy out one struct disk_sysctl
 * (or the caller-specified prefix of it) per attached disk.
 * Read-only; writing returns EPERM.
 */
int
sysctl_hw_diskstats(SYSCTLFN_ARGS)
{
	struct disk_sysctl sdisk;
	struct disk *diskp;
	char *where = oldp;
	size_t tocopy, left;
	int error;

	if (newp != NULL)
		return (EPERM);

	/*
	 * The original hw.diskstats call was broken and did not require
	 * the userland to pass in it's size of struct disk_sysctl.  This
	 * was fixed after NetBSD 1.6 was released, and any applications
	 * that do not pass in the size are given an error only, unless
	 * we care about 1.6 compatibility.
	 */
	if (namelen == 0)
#ifdef COMPAT_16
		/* 1.6 struct ended just before the dk_rxfer member. */
		tocopy = offsetof(struct disk_sysctl, dk_rxfer);
#else
		return (EINVAL);
#endif
	else
		tocopy = name[0];

	/* Sizing request: report the space needed for all disks. */
	if (where == NULL) {
		*oldlenp = disk_count * tocopy;
		return (0);
	}

	error = 0;
	left = *oldlenp;
	memset(&sdisk, 0, sizeof(sdisk));
	*oldlenp = 0;

	simple_lock(&disklist_slock);
	TAILQ_FOREACH(diskp, &disklist, dk_link) {
		if (left < tocopy)
			break;
		/* Snapshot this disk's statistics into the export struct. */
		strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name));
		sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer;
		sdisk.dk_rxfer = diskp->dk_rxfer;
		sdisk.dk_wxfer = diskp->dk_wxfer;
		sdisk.dk_seek = diskp->dk_seek;
		sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes;
		sdisk.dk_rbytes = diskp->dk_rbytes;
		sdisk.dk_wbytes = diskp->dk_wbytes;
		sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec;
		sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec;
		sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec;
		sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec;
		sdisk.dk_time_sec = diskp->dk_time.tv_sec;
		sdisk.dk_time_usec = diskp->dk_time.tv_usec;
		sdisk.dk_busy = diskp->dk_busy;

		/* Copy no more than the caller-requested prefix. */
		error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk)));
		if (error)
			break;
		where += tocopy;
		*oldlenp += tocopy;
		left -= tocopy;
	}
	simple_unlock(&disklist_slock);
	return (error);
}
476 
/* Private state for the first-come first-served strategy. */
struct bufq_fcfs {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

/* Private state for the one-way seek-sort strategy. */
struct bufq_disksort {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

/* Scheduling quotas for the read-priority strategy below. */
#define PRIO_READ_BURST		48
#define PRIO_WRITE_REQ		16

/* Private state for the read-priority strategy. */
struct bufq_prio {
	TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */
	struct buf *bq_write_next;	/* next request in bq_write */
	struct buf *bq_next;		/* current request */
	int bq_read_burst;		/* # of consecutive reads */
};
494 
495 
496 static __inline int buf_inorder(const struct buf *, const struct buf *, int);
497 
498 /*
499  * Check if two buf's are in ascending order.
500  */
501 static __inline int
502 buf_inorder(const struct buf *bp, const struct buf *bq, int sortby)
503 {
504 
505 	if (bp == NULL || bq == NULL)
506 		return (bq == NULL);
507 
508 	if (sortby == BUFQ_SORT_CYLINDER) {
509 		if (bp->b_cylinder != bq->b_cylinder)
510 			return bp->b_cylinder < bq->b_cylinder;
511 		else
512 			return bp->b_rawblkno < bq->b_rawblkno;
513 	} else
514 		return bp->b_rawblkno < bq->b_rawblkno;
515 }
516 
517 
518 /*
519  * First-come first-served sort for disks.
520  *
521  * Requests are appended to the queue without any reordering.
522  */
523 static void
524 bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
525 {
526 	struct bufq_fcfs *fcfs = bufq->bq_private;
527 
528 	TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
529 }
530 
531 static struct buf *
532 bufq_fcfs_get(struct bufq_state *bufq, int remove)
533 {
534 	struct bufq_fcfs *fcfs = bufq->bq_private;
535 	struct buf *bp;
536 
537 	bp = TAILQ_FIRST(&fcfs->bq_head);
538 
539 	if (bp != NULL && remove)
540 		TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
541 
542 	return (bp);
543 }
544 
545 
546 /*
547  * Seek sort for disks.
548  *
549  * There are actually two queues, sorted in ascendening order.  The first
550  * queue holds those requests which are positioned after the current block;
551  * the second holds requests which came in after their position was passed.
552  * Thus we implement a one-way scan, retracting after reaching the end of
553  * the drive to the first request on the second queue, at which time it
554  * becomes the first queue.
555  *
556  * A one-way scan is natural because of the way UNIX read-ahead blocks are
557  * allocated.
558  */
/*
 * Insert bp into the disksort queue.  The single TAILQ actually holds
 * two logically sorted runs: requests at/after the head position, then
 * (after one "inversion" in the ordering) requests already passed.
 */
static void
bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_disksort *disksort = bufq->bq_private;
	struct buf *bq, *nbq;
	int sortby;

	/* Sort key: raw block number, or cylinder-then-block. */
	sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	bq = TAILQ_FIRST(&disksort->bq_head);

	/*
	 * If the queue is empty it's easy; we just go on the end.
	 */
	if (bq == NULL) {
		TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
		return;
	}

	/*
	 * If we lie before the currently active request, then we
	 * must locate the second request list and add ourselves to it.
	 */
	if (buf_inorder(bp, bq, sortby)) {
		while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
			/*
			 * Check for an ``inversion'' in the normally ascending
			 * block numbers, indicating the start of the second
			 * request list.
			 */
			if (buf_inorder(nbq, bq, sortby)) {
				/*
				 * Search the second request list for the first
				 * request at a larger block number.  We go
				 * after that; if there is no such request, we
				 * go at the end.
				 */
				do {
					if (buf_inorder(bp, nbq, sortby))
						goto insert;
					bq = nbq;
				} while ((nbq =
				    TAILQ_NEXT(bq, b_actq)) != NULL);
				goto insert;		/* after last */
			}
			bq = nbq;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
		/*
		 * We want to go after the current request if there is an
		 * inversion after it (i.e. it is the end of the first
		 * request list), or if the next request is a larger cylinder
		 * than our request.
		 */
		if (buf_inorder(nbq, bq, sortby) ||
		    buf_inorder(bp, nbq, sortby))
			goto insert;
		bq = nbq;
	}
	/*
	 * Neither a second list nor a larger request... we go at the end of
	 * the first list, which is the same as the end of the whole schebang.
	 */
insert:	TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
}
634 
635 static struct buf *
636 bufq_disksort_get(struct bufq_state *bufq, int remove)
637 {
638 	struct bufq_disksort *disksort = bufq->bq_private;
639 	struct buf *bp;
640 
641 	bp = TAILQ_FIRST(&disksort->bq_head);
642 
643 	if (bp != NULL && remove)
644 		TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);
645 
646 	return (bp);
647 }
648 
649 
650 /*
651  * Seek sort for disks.
652  *
653  * There are two queues.  The first queue holds read requests; the second
654  * holds write requests.  The read queue is first-come first-served; the
655  * write queue is sorted in ascendening block order.
656  * The read queue is processed first.  After PRIO_READ_BURST consecutive
657  * read requests with non-empty write queue PRIO_WRITE_REQ requests from
658  * the write queue will be processed.
659  */
/*
 * Insert bp: reads are appended FCFS to bq_read; writes are inserted
 * in sorted position on bq_write, keeping bq_write_next pointing at
 * the next write to serve.
 */
static void
bufq_prio_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_prio *prio = bufq->bq_private;
	struct buf *bq;
	int sortby;

	sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	/*
	 * If it's a read request append it to the list.
	 */
	if ((bp->b_flags & B_READ) == B_READ) {
		TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq);
		return;
	}

	bq = TAILQ_FIRST(&prio->bq_write);

	/*
	 * If the write list is empty, simply append it to the list.
	 */
	if (bq == NULL) {
		TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
		prio->bq_write_next = bp;
		return;
	}

	/*
	 * If we lie after the next request, insert after this request.
	 * (Start the search at bq_write_next rather than the list head.)
	 */
	if (buf_inorder(prio->bq_write_next, bp, sortby))
		bq = prio->bq_write_next;

	/*
	 * Search for the first request at a larger block number.
	 * We go before this request if it exists.
	 */
	while (bq != NULL && buf_inorder(bq, bp, sortby))
		bq = TAILQ_NEXT(bq, b_actq);

	if (bq != NULL)
		TAILQ_INSERT_BEFORE(bq, bp, b_actq);
	else
		TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
}
706 
/*
 * Select the next request: reads are preferred, but after
 * PRIO_READ_BURST consecutive reads with writes pending, up to
 * PRIO_WRITE_REQ writes are served so writes cannot starve.
 */
static struct buf *
bufq_prio_get(struct bufq_state *bufq, int remove)
{
	struct bufq_prio *prio = bufq->bq_private;
	struct buf *bp;

	/*
	 * If no current request, get next from the lists.
	 */
	if (prio->bq_next == NULL) {
		/*
		 * If at least one list is empty, select the other.
		 */
		if (TAILQ_FIRST(&prio->bq_read) == NULL) {
			prio->bq_next = prio->bq_write_next;
			prio->bq_read_burst = 0;
		} else if (prio->bq_write_next == NULL) {
			prio->bq_next = TAILQ_FIRST(&prio->bq_read);
			prio->bq_read_burst = 0;
		} else {
			/*
			 * Both list have requests.  Select the read list up
			 * to PRIO_READ_BURST times, then select the write
			 * list PRIO_WRITE_REQ times.
			 */
			if (prio->bq_read_burst++ < PRIO_READ_BURST)
				prio->bq_next = TAILQ_FIRST(&prio->bq_read);
			else if (prio->bq_read_burst <
			    PRIO_READ_BURST + PRIO_WRITE_REQ)
				prio->bq_next = prio->bq_write_next;
			else {
				prio->bq_next = TAILQ_FIRST(&prio->bq_read);
				prio->bq_read_burst = 0;
			}
		}
	}

	bp = prio->bq_next;

	if (bp != NULL && remove) {
		if ((bp->b_flags & B_READ) == B_READ)
			TAILQ_REMOVE(&prio->bq_read, bp, b_actq);
		else {
			/*
			 * Advance the write pointer before removing
			 * bp since it is actually prio->bq_write_next.
			 */
			prio->bq_write_next =
			    TAILQ_NEXT(prio->bq_write_next, b_actq);
			TAILQ_REMOVE(&prio->bq_write, bp, b_actq);
			/* Wrap back to the smallest pending write. */
			if (prio->bq_write_next == NULL)
				prio->bq_write_next =
				    TAILQ_FIRST(&prio->bq_write);
		}

		/* Force a fresh selection on the next call. */
		prio->bq_next = NULL;
	}

	return (bp);
}
767 
768 
769 /*
770  * Cyclical scan (CSCAN)
771  */
TAILQ_HEAD(bqhead, buf);
/*
 * CSCAN queue state: two sorted lists, one for requests ahead of the
 * last served position and one for requests behind it (served on the
 * next sweep).
 */
struct cscan_queue {
	struct bqhead cq_head[2];	/* actual lists of buffers */
	int cq_idx;			/* current list index */
	int cq_lastcylinder;		/* b_cylinder of the last request */
	daddr_t cq_lastrawblkno;	/* b_rawblkno of the last request */
};

static int __inline cscan_empty(const struct cscan_queue *);
static void cscan_put(struct cscan_queue *, struct buf *, int);
static struct buf *cscan_get(struct cscan_queue *, int);
static void cscan_init(struct cscan_queue *);
784 
785 static __inline int
786 cscan_empty(const struct cscan_queue *q)
787 {
788 
789 	return TAILQ_EMPTY(&q->cq_head[0]) && TAILQ_EMPTY(&q->cq_head[1]);
790 }
791 
792 static void
793 cscan_put(struct cscan_queue *q, struct buf *bp, int sortby)
794 {
795 	struct buf tmp;
796 	struct buf *it;
797 	struct bqhead *bqh;
798 	int idx;
799 
800 	tmp.b_cylinder = q->cq_lastcylinder;
801 	tmp.b_rawblkno = q->cq_lastrawblkno;
802 
803 	if (buf_inorder(bp, &tmp, sortby))
804 		idx = 1 - q->cq_idx;
805 	else
806 		idx = q->cq_idx;
807 
808 	bqh = &q->cq_head[idx];
809 
810 	TAILQ_FOREACH(it, bqh, b_actq)
811 		if (buf_inorder(bp, it, sortby))
812 			break;
813 
814 	if (it != NULL)
815 		TAILQ_INSERT_BEFORE(it, bp, b_actq);
816 	else
817 		TAILQ_INSERT_TAIL(bqh, bp, b_actq);
818 }
819 
820 static struct buf *
821 cscan_get(struct cscan_queue *q, int remove)
822 {
823 	int idx = q->cq_idx;
824 	struct bqhead *bqh;
825 	struct buf *bp;
826 
827 	bqh = &q->cq_head[idx];
828 	bp = TAILQ_FIRST(bqh);
829 
830 	if (bp == NULL) {
831 		/* switch queue */
832 		idx = 1 - idx;
833 		bqh = &q->cq_head[idx];
834 		bp = TAILQ_FIRST(bqh);
835 	}
836 
837 	KDASSERT((bp != NULL && !cscan_empty(q)) ||
838 	         (bp == NULL && cscan_empty(q)));
839 
840 	if (bp != NULL && remove) {
841 		q->cq_idx = idx;
842 		TAILQ_REMOVE(bqh, bp, b_actq);
843 
844 		q->cq_lastcylinder = bp->b_cylinder;
845 		q->cq_lastrawblkno =
846 		    bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
847 	}
848 
849 	return (bp);
850 }
851 
852 static void
853 cscan_init(struct cscan_queue *q)
854 {
855 
856 	TAILQ_INIT(&q->cq_head[0]);
857 	TAILQ_INIT(&q->cq_head[1]);
858 }
859 
860 
861 /*
862  * Per-prioritiy CSCAN.
863  *
864  * XXX probably we should have a way to raise
865  * priority of the on-queue requests.
866  */
#define	PRIOCSCAN_NQUEUE	3	/* one CSCAN queue per priority level */

struct priocscan_queue {
	struct cscan_queue q_queue;	/* requests at this priority */
	int q_burst;			/* remaining burst quota */
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if 0
	/*
	 * XXX using "global" head position can reduce positioning time
	 * when switching between queues.
	 * although it might affect against fairness.
	 */
	daddr_t bq_lastrawblkno;
	int bq_lastcylinder;
#endif
};

/*
 * how many requests to serve when having pending requests on other queues.
 * Indexed by queue (0 = highest priority).
 *
 * XXX tune
 */
const int priocscan_burst[] = {
	64, 16, 4
};

static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);
static void bufq_priocscan_init(struct bufq_state *);
static __inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);
902 
903 static __inline struct cscan_queue *
904 bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
905 {
906 	static const int priocscan_priomap[] = {
907 		[BPRIO_TIMENONCRITICAL] = 2,
908 		[BPRIO_TIMELIMITED] = 1,
909 		[BPRIO_TIMECRITICAL] = 0
910 	};
911 
912 	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
913 }
914 
915 static void
916 bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
917 {
918 	struct bufq_priocscan *q = bufq->bq_private;
919 	struct cscan_queue *cq;
920 	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
921 
922 	cq = bufq_priocscan_selectqueue(q, bp);
923 	cscan_put(cq, bp, sortby);
924 }
925 
/*
 * Select the next request across the per-priority CSCAN queues.
 * Higher-priority queues are preferred, but each queue only gets
 * q_burst requests while lower-priority work is pending; when all
 * burst quotas are exhausted they are refilled from priocscan_burst[].
 */
static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* first non-empty queue */
	const struct priocscan_queue *epq;
	const struct cscan_queue *cq;
	struct buf *bp;
	boolean_t single; /* true if there's only one non-empty queue */

	/* Find the highest-priority non-empty queue. */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		cq = &pq->q_queue;
		if (!cscan_empty(cq))
			break;
	}
	if (pq == epq) {
		/* there's no requests */
		return NULL;
	}

	first = pq;
	single = TRUE;
	for (npq = first + 1; npq < epq; npq++) {
		cq = &npq->q_queue;
		if (!cscan_empty(cq)) {
			single = FALSE;
			if (pq->q_burst > 0)
				break;
			/* pq is out of quota; consider this lower queue. */
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only a non-empty queue.  just serve it.
		 */
		pq = first;
	} else if (pq->q_burst > 0) {
		/*
		 * XXX account only by number of requests.  is it good enough?
		 */
		pq->q_burst--;
	} else {
		/*
		 * no queue was selected due to burst counts
		 */
		int i;
#ifdef DEBUG
		for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
			pq = &q->bq_queue[i];
			cq = &pq->q_queue;
			if (!cscan_empty(cq) && pq->q_burst)
				panic("%s: inconsist", __func__);
		}
#endif /* DEBUG */

		/*
		 * reset burst counts
		 */
		for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
			pq = &q->bq_queue[i];
			pq->q_burst = priocscan_burst[i];
		}

		/*
		 * serve first non-empty queue.
		 */
		pq = first;
	}

	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove);
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}
1005 
1006 static void
1007 bufq_priocscan_init(struct bufq_state *bufq)
1008 {
1009 	struct bufq_priocscan *q;
1010 	int i;
1011 
1012 	bufq->bq_get = bufq_priocscan_get;
1013 	bufq->bq_put = bufq_priocscan_put;
1014 	bufq->bq_private = malloc(sizeof(struct bufq_priocscan),
1015 	    M_DEVBUF, M_ZERO);
1016 
1017 	q = bufq->bq_private;
1018 	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
1019 		struct cscan_queue *cq = &q->bq_queue[i].q_queue;
1020 
1021 		cscan_init(cq);
1022 	}
1023 }
1024 
1025 
1026 /*
1027  * Create a device buffer queue.
1028  */
/*
 * Create a device buffer queue: validate the sort and method bits of
 * flags, install the chosen strategy's get/put hooks, and allocate its
 * private state.  Panics on an invalid flag combination.
 */
void
bufq_alloc(struct bufq_state *bufq, int flags)
{
	struct bufq_fcfs *fcfs;
	struct bufq_disksort *disksort;
	struct bufq_prio *prio;

	bufq->bq_flags = flags;

	switch (flags & BUFQ_SORT_MASK) {
	case BUFQ_SORT_RAWBLOCK:
	case BUFQ_SORT_CYLINDER:
		break;
	case 0:
		/* Only FCFS may be used without a sort order. */
		if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS)
			break;
		/* FALLTHROUGH */
	default:
		panic("bufq_alloc: sort out of range");
	}

	switch (flags & BUFQ_METHOD_MASK) {
	case BUFQ_FCFS:
		bufq->bq_get = bufq_fcfs_get;
		bufq->bq_put = bufq_fcfs_put;
		MALLOC(bufq->bq_private, struct bufq_fcfs *,
		    sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO);
		fcfs = (struct bufq_fcfs *)bufq->bq_private;
		TAILQ_INIT(&fcfs->bq_head);
		break;
	case BUFQ_DISKSORT:
		bufq->bq_get = bufq_disksort_get;
		bufq->bq_put = bufq_disksort_put;
		MALLOC(bufq->bq_private, struct bufq_disksort *,
		    sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO);
		disksort = (struct bufq_disksort *)bufq->bq_private;
		TAILQ_INIT(&disksort->bq_head);
		break;
	case BUFQ_READ_PRIO:
		bufq->bq_get = bufq_prio_get;
		bufq->bq_put = bufq_prio_put;
		MALLOC(bufq->bq_private, struct bufq_prio *,
		    sizeof(struct bufq_prio), M_DEVBUF, M_ZERO);
		prio = (struct bufq_prio *)bufq->bq_private;
		TAILQ_INIT(&prio->bq_read);
		TAILQ_INIT(&prio->bq_write);
		break;
	case BUFQ_PRIOCSCAN:
		/* priocscan sets up its own hooks and state. */
		bufq_priocscan_init(bufq);
		break;
	default:
		panic("bufq_alloc: method out of range");
	}
}
1083 
1084 /*
1085  * Destroy a device buffer queue.
1086  */
1087 void
1088 bufq_free(struct bufq_state *bufq)
1089 {
1090 
1091 	KASSERT(bufq->bq_private != NULL);
1092 	KASSERT(BUFQ_PEEK(bufq) == NULL);
1093 
1094 	FREE(bufq->bq_private, M_DEVBUF);
1095 	bufq->bq_get = NULL;
1096 	bufq->bq_put = NULL;
1097 }
1098 
1099 /*
1100  * Bounds checking against the media size, used for the raw partition.
1101  * The sector size passed in should currently always be DEV_BSIZE,
1102  * and the media size the size of the device in DEV_BSIZE sectors.
1103  */
1104 int
1105 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize)
1106 {
1107 	int sz;
1108 
1109 	sz = howmany(bp->b_bcount, secsize);
1110 
1111 	if (bp->b_blkno + sz > mediasize) {
1112 		sz = mediasize - bp->b_blkno;
1113 		if (sz == 0) {
1114 			/* If exactly at end of disk, return EOF. */
1115 			bp->b_resid = bp->b_bcount;
1116 			goto done;
1117 		}
1118 		if (sz < 0) {
1119 			/* If past end of disk, return EINVAL. */
1120 			bp->b_error = EINVAL;
1121 			goto bad;
1122 		}
1123 		/* Otherwise, truncate request. */
1124 		bp->b_bcount = sz << DEV_BSHIFT;
1125 	}
1126 
1127 	return 1;
1128 
1129 bad:
1130 	bp->b_flags |= B_ERROR;
1131 done:
1132 	return 0;
1133 }
1134