/*	$NetBSD: subr_disk.c,v 1.58 2004/01/10 14:49:44 yamt Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.58 2004/01/10 14:49:44 yamt Exp $");

#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/syslog.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <lib/libkern/libkern.h>

/*
 * A global list of all disks attached to the system.  May grow or
 * shrink over time.
 */
struct	disklist_head disklist;	/* TAILQ_HEAD */
int	disk_count;		/* number of drives in global disklist */
struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER;

/*
 * Compute checksum for disk label.
 */
u_int
dkcksum(struct disklabel *lp)
{
	u_short *start, *end;
	u_short sum = 0;

	start = (u_short *)lp;
	end = (u_short *)&lp->d_partitions[lp->d_npartitions];
	while (start < end)
		sum ^= *start++;
	return (sum);
}
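
/*
 * Illustrative sketch (not part of the original file): the checksum is a
 * plain XOR over the label up to and including the defined partitions, and
 * d_checksum itself lies inside that span.  So a consumer validating an
 * on-disk label typically checks the magic numbers and expects the XOR to
 * come out to zero, while a writer zeroes d_checksum before storing the
 * computed sum.  The helper names below are hypothetical.
 */
#if 0
static int
example_label_ok(struct disklabel *lp)
{

	return (lp->d_magic == DISKMAGIC && lp->d_magic2 == DISKMAGIC &&
	    dkcksum(lp) == 0);
}

static void
example_label_seal(struct disklabel *lp)
{

	lp->d_checksum = 0;
	lp->d_checksum = dkcksum(lp);
}
#endif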

/*
 * Disk error is the preface to plaintive error messages
 * about failing disk transfers.  It prints messages of the form

hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)

 * if the offset of the error in the transfer and a disk label
 * are both available.  blkdone should be -1 if the position of the error
 * is unknown; the disklabel pointer may be null from drivers that have not
 * been converted to use them.  The message is printed with printf
 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
 * The message should be completed (with at least a newline) with printf
 * or addlog, respectively.  There is no trailing space.
 */
#ifndef PRIdaddr
#define PRIdaddr PRId64
#endif
void
diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
    int blkdone, const struct disklabel *lp)
{
	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
	void (*pr)(const char *, ...);
	char partname = 'a' + part;
	daddr_t sn;
	if (/*CONSTCOND*/0)
		/* Compiler will error here if the format is wrong... */
		printf("%" PRIdaddr, bp->b_blkno);

	if (pri != LOG_PRINTF) {
		static const char fmt[] = "";
		log(pri, fmt);
		pr = addlog;
	} else
		pr = printf;
	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
	    bp->b_flags & B_READ ? "read" : "writ");
	sn = bp->b_blkno;
	if (bp->b_bcount <= DEV_BSIZE)
		(*pr)("%" PRIdaddr, sn);
	else {
		if (blkdone >= 0) {
			sn += blkdone;
			(*pr)("%" PRIdaddr " of ", sn);
		}
		(*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
	}
	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
		sn += lp->d_partitions[part].p_offset;
		(*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
		    dname, unit, sn, sn / lp->d_secpercyl);
		sn %= lp->d_secpercyl;
		(*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
		    sn / lp->d_nsectors, sn % lp->d_nsectors);
	}
}
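
/*
 * Illustrative sketch (not part of the original file): a driver's error
 * path would typically call diskerr() and then finish the line itself, as
 * the comment above requires.  "sc" is a hypothetical driver softc; -1
 * says the position of the error within the transfer is unknown.
 */
#if 0
	diskerr(bp, "wd", "soft error", LOG_PRINTF, -1, sc->sc_dk.dk_label);
	printf("\n");
#endif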

/*
 * Initialize the disklist.  Called by main() before autoconfiguration.
 */
void
disk_init(void)
{

	TAILQ_INIT(&disklist);
	disk_count = 0;
}

/*
 * Searches the disklist for the disk corresponding to the
 * name provided.
 */
struct disk *
disk_find(char *name)
{
	struct disk *diskp;

	if ((name == NULL) || (disk_count <= 0))
		return (NULL);

	simple_lock(&disklist_slock);
	for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
	    diskp = TAILQ_NEXT(diskp, dk_link))
		if (strcmp(diskp->dk_name, name) == 0) {
			simple_unlock(&disklist_slock);
			return (diskp);
		}
	simple_unlock(&disklist_slock);

	return (NULL);
}
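
/*
 * Illustrative sketch (not part of the original file): pseudo-disk drivers
 * such as ccd(4) look up their component disks by name this way; the disk
 * name below is hypothetical.
 */
#if 0
	struct disk *dk;

	if ((dk = disk_find("wd0")) == NULL)
		return (ENODEV);
#endif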

/*
 * Attach a disk.
 */
void
disk_attach(struct disk *diskp)
{
	int s;

	/*
	 * Allocate and initialize the disklabel structures.  Note that
	 * it's not safe to sleep here, since we're probably going to be
	 * called during autoconfiguration.
	 */
	diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT);
	diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF,
	    M_NOWAIT);
	if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL))
		panic("disk_attach: can't allocate storage for disklabel");

	memset(diskp->dk_label, 0, sizeof(struct disklabel));
	memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel));

	/*
	 * Set the attached timestamp.
	 */
	s = splclock();
	diskp->dk_attachtime = mono_time;
	splx(s);

	/*
	 * Link into the disklist.
	 */
	simple_lock(&disklist_slock);
	TAILQ_INSERT_TAIL(&disklist, diskp, dk_link);
	simple_unlock(&disklist_slock);
	++disk_count;
}
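
/*
 * Illustrative sketch (not part of the original file): a typical driver
 * fills in the name and driver hooks of the struct disk embedded in its
 * softc before attaching it; "sc" and "xxdkdriver" are hypothetical.
 */
#if 0
	sc->sc_dk.dk_driver = &xxdkdriver;
	sc->sc_dk.dk_name = sc->sc_dev.dv_xname;
	disk_attach(&sc->sc_dk);
#endif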

/*
 * Detach a disk.
 */
void
disk_detach(struct disk *diskp)
{

	/*
	 * Remove from the disklist.
	 */
	if (--disk_count < 0)
		panic("disk_detach: disk_count < 0");
	simple_lock(&disklist_slock);
	TAILQ_REMOVE(&disklist, diskp, dk_link);
	simple_unlock(&disklist_slock);

	/*
	 * Free the space used by the disklabel structures.
	 */
	free(diskp->dk_label, M_DEVBUF);
	free(diskp->dk_cpulabel, M_DEVBUF);
}

/*
 * Increment a disk's busy counter.  If the counter is going from
 * 0 to 1, set the timestamp.
 */
void
disk_busy(struct disk *diskp)
{
	int s;

	/*
	 * XXX We'd like to use something as accurate as microtime(),
	 * but that doesn't depend on the system TOD clock.
	 */
	if (diskp->dk_busy++ == 0) {
		s = splclock();
		diskp->dk_timestamp = mono_time;
		splx(s);
	}
}

/*
 * Decrement a disk's busy counter, increment the byte count, total busy
 * time, and reset the timestamp.
 */
void
disk_unbusy(struct disk *diskp, long bcount, int read)
{
	int s;
	struct timeval dv_time, diff_time;

	if (diskp->dk_busy-- == 0) {
		printf("%s: dk_busy < 0\n", diskp->dk_name);
		panic("disk_unbusy");
	}

	s = splclock();
	dv_time = mono_time;
	splx(s);

	timersub(&dv_time, &diskp->dk_timestamp, &diff_time);
	timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time);

	diskp->dk_timestamp = dv_time;
	if (bcount > 0) {
		if (read) {
			diskp->dk_rbytes += bcount;
			diskp->dk_rxfer++;
		} else {
			diskp->dk_wbytes += bcount;
			diskp->dk_wxfer++;
		}
	}
}
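
/*
 * Illustrative sketch (not part of the original file): drivers bracket
 * each transfer with these two calls, so iostat(8) can derive utilization
 * from dk_time and throughput from the byte counters; "sc" is a
 * hypothetical softc.
 */
#if 0
	disk_busy(&sc->sc_dk);		/* just before starting the I/O */
	/* ... the hardware completes the transfer, then in the done path: */
	disk_unbusy(&sc->sc_dk, bp->b_bcount - bp->b_resid,
	    (bp->b_flags & B_READ) != 0);
#endif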

/*
 * Reset the metrics counters on the given disk.  Note that we cannot
 * reset the busy counter, as it may cause a panic in disk_unbusy().
 * We also must avoid playing with the timestamp information, as it
 * may skew any pending transfer results.
 */
void
disk_resetstat(struct disk *diskp)
{
	int s = splbio(), t;

	diskp->dk_rxfer = 0;
	diskp->dk_rbytes = 0;
	diskp->dk_wxfer = 0;
	diskp->dk_wbytes = 0;

	t = splclock();
	diskp->dk_attachtime = mono_time;
	splx(t);

	timerclear(&diskp->dk_time);

	splx(s);
}

int
sysctl_hw_disknames(SYSCTLFN_ARGS)
{
	char buf[DK_DISKNAMELEN + 1];
	char *where = oldp;
	struct disk *diskp;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	simple_lock(&disklist_slock);
	for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
	    diskp = TAILQ_NEXT(diskp, dk_link)) {
		if (where == NULL)
			needed += strlen(diskp->dk_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, diskp->dk_name, sizeof(buf));
				first = 0;
			} else {
				buf[0] = ' ';
				strncpy(buf + 1, diskp->dk_name,
				    sizeof(buf) - 1);
			}
			buf[DK_DISKNAMELEN] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	simple_unlock(&disklist_slock);
	*oldlenp = needed;
	return (error);
}

int
sysctl_hw_diskstats(SYSCTLFN_ARGS)
{
	struct disk_sysctl sdisk;
	struct disk *diskp;
	char *where = oldp;
	size_t tocopy, left;
	int error;

	if (newp != NULL)
		return (EPERM);

	/*
	 * The original hw.diskstats call was broken and did not require
	 * userland to pass in its size of struct disk_sysctl.  This
	 * was fixed after NetBSD 1.6 was released, and any application
	 * that does not pass in the size now gets an error, unless
	 * we care about 1.6 compatibility.
	 */
	if (namelen == 0)
#ifdef COMPAT_16
		tocopy = offsetof(struct disk_sysctl, dk_rxfer);
#else
		return (EINVAL);
#endif
	else
		tocopy = name[0];

	if (where == NULL) {
		*oldlenp = disk_count * tocopy;
		return (0);
	}

	error = 0;
	left = *oldlenp;
	memset(&sdisk, 0, sizeof(sdisk));
	*oldlenp = 0;

	simple_lock(&disklist_slock);
	TAILQ_FOREACH(diskp, &disklist, dk_link) {
		if (left < tocopy)
			break;
		strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name));
		sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer;
		sdisk.dk_rxfer = diskp->dk_rxfer;
		sdisk.dk_wxfer = diskp->dk_wxfer;
		sdisk.dk_seek = diskp->dk_seek;
		sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes;
		sdisk.dk_rbytes = diskp->dk_rbytes;
		sdisk.dk_wbytes = diskp->dk_wbytes;
		sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec;
		sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec;
		sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec;
		sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec;
		sdisk.dk_time_sec = diskp->dk_time.tv_sec;
		sdisk.dk_time_usec = diskp->dk_time.tv_usec;
		sdisk.dk_busy = diskp->dk_busy;

		error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk)));
		if (error)
			break;
		where += tocopy;
		*oldlenp += tocopy;
		left -= tocopy;
	}
	simple_unlock(&disklist_slock);
	return (error);
}

struct bufq_fcfs {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

struct bufq_disksort {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

#define PRIO_READ_BURST		48
#define PRIO_WRITE_REQ		16

struct bufq_prio {
	TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */
	struct buf *bq_write_next;	/* next request in bq_write */
	struct buf *bq_next;		/* current request */
	int bq_read_burst;		/* # of consecutive reads */
};


/*
 * Check if two bufs are in ascending order.
 */
static __inline int
buf_inorder(struct buf *bp, struct buf *bq, int sortby)
{

	if (bp == NULL || bq == NULL)
		return (bq == NULL);

	if (sortby == BUFQ_SORT_CYLINDER) {
		if (bp->b_cylinder != bq->b_cylinder)
			return bp->b_cylinder < bq->b_cylinder;
		else
			return bp->b_rawblkno < bq->b_rawblkno;
	} else
		return bp->b_rawblkno < bq->b_rawblkno;
}
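
/*
 * Illustrative note (not part of the original file): the NULL handling
 * above makes NULL act as a sentinel larger than any request, i.e.
 * buf_inorder(bp, NULL, ...) is true and buf_inorder(NULL, bq, ...) is
 * false for non-NULL bp/bq, so callers can treat NULL as "end of queue".
 * Under BUFQ_SORT_CYLINDER, a buf at cylinder 3, block 10 orders before
 * one at cylinder 3, block 20, and both order before anything at
 * cylinder 4.
 */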


/*
 * First-come first-served sort for disks.
 *
 * Requests are appended to the queue without any reordering.
 */
static void
bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_fcfs *fcfs = bufq->bq_private;

	TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
}

static struct buf *
bufq_fcfs_get(struct bufq_state *bufq, int remove)
{
	struct bufq_fcfs *fcfs = bufq->bq_private;
	struct buf *bp;

	bp = TAILQ_FIRST(&fcfs->bq_head);

	if (bp != NULL && remove)
		TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);

	return (bp);
}


/*
 * Seek sort for disks.
 *
 * There are actually two queues, sorted in ascending order.  The first
 * queue holds those requests which are positioned after the current block;
 * the second holds requests which came in after their position was passed.
 * Thus we implement a one-way scan, retracting after reaching the end of
 * the drive to the first request on the second queue, at which time it
 * becomes the first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead blocks are
 * allocated.
 */
static void
bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_disksort *disksort = bufq->bq_private;
	struct buf *bq, *nbq;
	int sortby;

	sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	bq = TAILQ_FIRST(&disksort->bq_head);

	/*
	 * If the queue is empty it's easy; we just go on the end.
	 */
	if (bq == NULL) {
		TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
		return;
	}

	/*
	 * If we lie before the currently active request, then we
	 * must locate the second request list and add ourselves to it.
	 */
	if (buf_inorder(bp, bq, sortby)) {
		while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
			/*
			 * Check for an ``inversion'' in the normally ascending
			 * block numbers, indicating the start of the second
			 * request list.
			 */
			if (buf_inorder(nbq, bq, sortby)) {
				/*
				 * Search the second request list for the first
				 * request at a larger block number.  We go
				 * after that; if there is no such request, we
				 * go at the end.
				 */
				do {
					if (buf_inorder(bp, nbq, sortby))
						goto insert;
					bq = nbq;
				} while ((nbq =
				    TAILQ_NEXT(bq, b_actq)) != NULL);
				goto insert;		/* after last */
			}
			bq = nbq;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
		/*
		 * We want to go after the current request if there is an
		 * inversion after it (i.e. it is the end of the first
		 * request list), or if the next request is a larger cylinder
		 * than our request.
		 */
		if (buf_inorder(nbq, bq, sortby) ||
		    buf_inorder(bp, nbq, sortby))
			goto insert;
		bq = nbq;
	}
	/*
	 * Neither a second list nor a larger request... we go at the end of
	 * the first list, which is the same as the end of the whole shebang.
	 */
insert:	TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
}
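
/*
 * Illustrative example (not part of the original file): if the head is at
 * block 60, the queue
 *
 *	60 70 90 | 10 20 40
 *
 * holds the first request list (60 70 90, at or after the head) followed
 * by the second (10 20 40, already passed); the inversion between 90 and
 * 10 is what bufq_disksort_put() searches for.  A new request for block
 * 80 is inserted after 70, and one for block 30 is inserted after 20.
 */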

static struct buf *
bufq_disksort_get(struct bufq_state *bufq, int remove)
{
	struct bufq_disksort *disksort = bufq->bq_private;
	struct buf *bp;

	bp = TAILQ_FIRST(&disksort->bq_head);

	if (bp != NULL && remove)
		TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);

	return (bp);
}


/*
 * Seek sort for disks.
 *
 * There are two queues.  The first queue holds read requests; the second
 * holds write requests.  The read queue is first-come first-served; the
 * write queue is sorted in ascending block order.
 * The read queue is processed first.  After PRIO_READ_BURST consecutive
 * read requests have been served with the write queue non-empty,
 * PRIO_WRITE_REQ requests from the write queue will be processed.
 */
static void
bufq_prio_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_prio *prio = bufq->bq_private;
	struct buf *bq;
	int sortby;

	sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	/*
	 * If it's a read request append it to the list.
	 */
	if ((bp->b_flags & B_READ) == B_READ) {
		TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq);
		return;
	}

	bq = TAILQ_FIRST(&prio->bq_write);

	/*
	 * If the write list is empty, simply append it to the list.
	 */
	if (bq == NULL) {
		TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
		prio->bq_write_next = bp;
		return;
	}

	/*
	 * If we lie after the next request, insert after this request.
	 */
	if (buf_inorder(prio->bq_write_next, bp, sortby))
		bq = prio->bq_write_next;

	/*
	 * Search for the first request at a larger block number.
	 * We go before this request if it exists.
	 */
	while (bq != NULL && buf_inorder(bq, bp, sortby))
		bq = TAILQ_NEXT(bq, b_actq);

	if (bq != NULL)
		TAILQ_INSERT_BEFORE(bq, bp, b_actq);
	else
		TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
}

static struct buf *
bufq_prio_get(struct bufq_state *bufq, int remove)
{
	struct bufq_prio *prio = bufq->bq_private;
	struct buf *bp;

	/*
	 * If no current request, get next from the lists.
	 */
	if (prio->bq_next == NULL) {
		/*
		 * If at least one list is empty, select the other.
		 */
		if (TAILQ_FIRST(&prio->bq_read) == NULL) {
			prio->bq_next = prio->bq_write_next;
			prio->bq_read_burst = 0;
		} else if (prio->bq_write_next == NULL) {
			prio->bq_next = TAILQ_FIRST(&prio->bq_read);
			prio->bq_read_burst = 0;
		} else {
			/*
			 * Both lists have requests.  Select the read list up
			 * to PRIO_READ_BURST times, then select the write
			 * list PRIO_WRITE_REQ times.
			 */
			if (prio->bq_read_burst++ < PRIO_READ_BURST)
				prio->bq_next = TAILQ_FIRST(&prio->bq_read);
			else if (prio->bq_read_burst <
			    PRIO_READ_BURST + PRIO_WRITE_REQ)
				prio->bq_next = prio->bq_write_next;
			else {
				prio->bq_next = TAILQ_FIRST(&prio->bq_read);
				prio->bq_read_burst = 0;
			}
		}
	}

	bp = prio->bq_next;

	if (bp != NULL && remove) {
		if ((bp->b_flags & B_READ) == B_READ)
			TAILQ_REMOVE(&prio->bq_read, bp, b_actq);
		else {
			/*
			 * Advance the write pointer before removing
			 * bp since it is actually prio->bq_write_next.
			 */
			prio->bq_write_next =
			    TAILQ_NEXT(prio->bq_write_next, b_actq);
			TAILQ_REMOVE(&prio->bq_write, bp, b_actq);
			if (prio->bq_write_next == NULL)
				prio->bq_write_next =
				    TAILQ_FIRST(&prio->bq_write);
		}

		prio->bq_next = NULL;
	}

	return (bp);
}
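
/*
 * Illustrative note (not part of the original file): with the default
 * constants, a saturated queue is served in cycles of 48 reads followed
 * by 16 writes, i.e. reads get 75% of the request slots while writes are
 * still guaranteed forward progress.
 */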


/*
 * Cyclical scan (CSCAN)
 */
TAILQ_HEAD(bqhead, buf);
struct cscan_queue {
	struct bqhead cq_head[2];	/* actual lists of buffers */
	int cq_idx;			/* current list index */
	int cq_lastcylinder;		/* b_cylinder of the last request */
	daddr_t cq_lastrawblkno;	/* b_rawblkno of the last request */
};

static int __inline cscan_empty(const struct cscan_queue *);
static void cscan_put(struct cscan_queue *, struct buf *, int);
static struct buf *cscan_get(struct cscan_queue *, int);
static void cscan_init(struct cscan_queue *);

static __inline int
cscan_empty(const struct cscan_queue *q)
{

	return TAILQ_EMPTY(&q->cq_head[0]) && TAILQ_EMPTY(&q->cq_head[1]);
}

static void
cscan_put(struct cscan_queue *q, struct buf *bp, int sortby)
{
	struct buf tmp;
	struct buf *it;
	struct bqhead *bqh;
	int idx;

	tmp.b_cylinder = q->cq_lastcylinder;
	tmp.b_rawblkno = q->cq_lastrawblkno;

	if (buf_inorder(bp, &tmp, sortby))
		idx = 1 - q->cq_idx;
	else
		idx = q->cq_idx;

	bqh = &q->cq_head[idx];

	TAILQ_FOREACH(it, bqh, b_actq)
		if (buf_inorder(bp, it, sortby))
			break;

	if (it != NULL)
		TAILQ_INSERT_BEFORE(it, bp, b_actq);
	else
		TAILQ_INSERT_TAIL(bqh, bp, b_actq);
}

static struct buf *
cscan_get(struct cscan_queue *q, int remove)
{
	int idx = q->cq_idx;
	struct bqhead *bqh;
	struct buf *bp;

	bqh = &q->cq_head[idx];
	bp = TAILQ_FIRST(bqh);

	if (bp == NULL) {
		/* switch queue */
		idx = 1 - idx;
		bqh = &q->cq_head[idx];
		bp = TAILQ_FIRST(bqh);
	}

	KDASSERT((bp != NULL && !cscan_empty(q)) ||
	         (bp == NULL && cscan_empty(q)));

	if (bp != NULL && remove) {
		q->cq_idx = idx;
		TAILQ_REMOVE(bqh, bp, b_actq);

		q->cq_lastcylinder = bp->b_cylinder;
		q->cq_lastrawblkno =
		    bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
	}

	return (bp);
}

static void
cscan_init(struct cscan_queue *q)
{

	TAILQ_INIT(&q->cq_head[0]);
	TAILQ_INIT(&q->cq_head[1]);
}
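
/*
 * Illustrative note (not part of the original file): unlike the two-list
 * disksort above, CSCAN keeps its two sorted lists in separate queues.
 * cscan_put() files requests behind the last serviced position into the
 * inactive list; when the active list drains, cscan_get() switches lists,
 * which is the "return to the start of the disk" of a classic cyclical
 * scan.
 */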


/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise
 * priority of the on-queue requests.
 */
#define	PRIOCSCAN_NQUEUE	3

struct priocscan_queue {
	struct cscan_queue q_queue;
	int q_burst;
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if 0
	/*
	 * XXX using "global" head position can reduce positioning time
	 * when switching between queues.
	 * although it might affect fairness.
	 */
	daddr_t bq_lastrawblkno;
	int bq_lastcylinder;
#endif
};

/*
 * How many requests to serve when other queues have pending requests.
 *
 * XXX tune
 */
const int priocscan_burst[] = {
	64, 16, 4
};

static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);
static void bufq_priocscan_init(struct bufq_state *);
static __inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

static __inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct cscan_queue *cq;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp, sortby);
}

static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* first non-empty queue */
	const struct priocscan_queue *epq;
	const struct cscan_queue *cq;
	struct buf *bp;
	boolean_t single; /* true if there's only one non-empty queue */

	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		cq = &pq->q_queue;
		if (!cscan_empty(cq))
			break;
	}
	if (pq == epq) {
		/* there are no requests */
		return NULL;
	}

	first = pq;
	single = TRUE;
	for (npq = first + 1; npq < epq; npq++) {
		cq = &npq->q_queue;
		if (!cscan_empty(cq)) {
			single = FALSE;
			if (pq->q_burst > 0)
				break;
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.  just serve it.
		 */
		pq = first;
	} else if (pq->q_burst > 0) {
		/*
		 * XXX account only by number of requests.  is it good enough?
		 */
		pq->q_burst--;
	} else {
		/*
		 * no queue was selected due to burst counts
		 */
		int i;
#ifdef DEBUG
		for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
			pq = &q->bq_queue[i];
			cq = &pq->q_queue;
			if (!cscan_empty(cq) && pq->q_burst)
				panic("%s: inconsistent", __func__);
		}
#endif /* DEBUG */

		/*
		 * reset burst counts
		 */
		for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
			pq = &q->bq_queue[i];
			pq->q_burst = priocscan_burst[i];
		}

		/*
		 * serve first non-empty queue.
		 */
		pq = first;
	}

	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove);
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}

static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_private = malloc(sizeof(struct bufq_priocscan),
	    M_DEVBUF, M_ZERO);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq);
	}
}


/*
 * Create a device buffer queue.
 */
void
bufq_alloc(struct bufq_state *bufq, int flags)
{
	struct bufq_fcfs *fcfs;
	struct bufq_disksort *disksort;
	struct bufq_prio *prio;

	bufq->bq_flags = flags;

	switch (flags & BUFQ_SORT_MASK) {
	case BUFQ_SORT_RAWBLOCK:
	case BUFQ_SORT_CYLINDER:
		break;
	case 0:
		if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS)
			break;
		/* FALLTHROUGH */
	default:
		panic("bufq_alloc: sort out of range");
	}

	switch (flags & BUFQ_METHOD_MASK) {
	case BUFQ_FCFS:
		bufq->bq_get = bufq_fcfs_get;
		bufq->bq_put = bufq_fcfs_put;
		MALLOC(bufq->bq_private, struct bufq_fcfs *,
		    sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO);
		fcfs = (struct bufq_fcfs *)bufq->bq_private;
		TAILQ_INIT(&fcfs->bq_head);
		break;
	case BUFQ_DISKSORT:
		bufq->bq_get = bufq_disksort_get;
		bufq->bq_put = bufq_disksort_put;
		MALLOC(bufq->bq_private, struct bufq_disksort *,
		    sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO);
		disksort = (struct bufq_disksort *)bufq->bq_private;
		TAILQ_INIT(&disksort->bq_head);
		break;
	case BUFQ_READ_PRIO:
		bufq->bq_get = bufq_prio_get;
		bufq->bq_put = bufq_prio_put;
		MALLOC(bufq->bq_private, struct bufq_prio *,
		    sizeof(struct bufq_prio), M_DEVBUF, M_ZERO);
		prio = (struct bufq_prio *)bufq->bq_private;
		TAILQ_INIT(&prio->bq_read);
		TAILQ_INIT(&prio->bq_write);
		break;
	case BUFQ_PRIOCSCAN:
		bufq_priocscan_init(bufq);
		break;
	default:
		panic("bufq_alloc: method out of range");
	}
}
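
/*
 * Illustrative sketch (not part of the original file): a driver sets up
 * its queue once at attach time and then feeds it through the BUFQ macros
 * from its strategy and start routines; "sc" and "xxstart" are
 * hypothetical.
 */
#if 0
	bufq_alloc(&sc->sc_bufq, BUFQ_DISKSORT | BUFQ_SORT_RAWBLOCK);
	/* ... in the strategy routine ... */
	BUFQ_PUT(&sc->sc_bufq, bp);
	/* ... in the start routine ... */
	while ((bp = BUFQ_GET(&sc->sc_bufq)) != NULL)
		xxstart(sc, bp);
#endif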

/*
 * Destroy a device buffer queue.
 */
void
bufq_free(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	KASSERT(BUFQ_PEEK(bufq) == NULL);

	FREE(bufq->bq_private, M_DEVBUF);
	bufq->bq_get = NULL;
	bufq->bq_put = NULL;
}

/*
 * Bounds checking against the media size, used for the raw partition.
 * The sector size passed in should currently always be DEV_BSIZE,
 * and the media size the size of the device in DEV_BSIZE sectors.
 */
int
bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize)
{
	int sz;

	sz = howmany(bp->b_bcount, secsize);

	if (bp->b_blkno + sz > mediasize) {
		sz = mediasize - bp->b_blkno;
		if (sz == 0) {
			/* If exactly at end of disk, return EOF. */
			bp->b_resid = bp->b_bcount;
			goto done;
		}
		if (sz < 0) {
			/* If past end of disk, return EINVAL. */
			bp->b_error = EINVAL;
			goto bad;
		}
		/* Otherwise, truncate request. */
		bp->b_bcount = sz << DEV_BSHIFT;
	}

	return 1;

bad:
	bp->b_flags |= B_ERROR;
done:
	return 0;
}
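
/*
 * Illustrative sketch (not part of the original file): a strategy routine
 * for the raw partition would typically call this before queueing the
 * buffer, completing the request immediately on a zero return;
 * "sc_mediasize" is a hypothetical softc field holding the device size in
 * DEV_BSIZE units.
 */
#if 0
	if (bounds_check_with_mediasize(bp, DEV_BSIZE,
	    sc->sc_mediasize) <= 0) {
		biodone(bp);
		return;
	}
#endif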