1 /* $NetBSD: subr_disk.c,v 1.52 2003/04/13 09:08:04 dsl Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1982, 1986, 1988, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. All advertising materials mentioning features or use of this software 58 * must display the following acknowledgement: 59 * This product includes software developed by the University of 60 * California, Berkeley and its contributors. 61 * 4. Neither the name of the University nor the names of its contributors 62 * may be used to endorse or promote products derived from this software 63 * without specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 66 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 69 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 70 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 71 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 72 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 73 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 74 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 75 * SUCH DAMAGE. 76 * 77 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 78 */ 79 80 #include <sys/cdefs.h> 81 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.52 2003/04/13 09:08:04 dsl Exp $"); 82 83 #include "opt_compat_netbsd.h" 84 85 #include <sys/param.h> 86 #include <sys/kernel.h> 87 #include <sys/malloc.h> 88 #include <sys/buf.h> 89 #include <sys/syslog.h> 90 #include <sys/disklabel.h> 91 #include <sys/disk.h> 92 #include <sys/sysctl.h> 93 #include <lib/libkern/libkern.h> 94 95 /* 96 * A global list of all disks attached to the system. May grow or 97 * shrink over time. 98 */ 99 struct disklist_head disklist; /* TAILQ_HEAD */ 100 int disk_count; /* number of drives in global disklist */ 101 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 102 103 /* 104 * Compute checksum for disk label. 105 */ 106 u_int 107 dkcksum(struct disklabel *lp) 108 { 109 u_short *start, *end; 110 u_short sum = 0; 111 112 start = (u_short *)lp; 113 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 114 while (start < end) 115 sum ^= *start++; 116 return (sum); 117 } 118 119 /* 120 * Disk error is the preface to plaintive error messages 121 * about failing disk transfers. It prints messages of the form 122 123 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 124 125 * if the offset of the error in the transfer and a disk label 126 * are both available. blkdone should be -1 if the position of the error 127 * is unknown; the disklabel pointer may be null from drivers that have not 128 * been converted to use them. The message is printed with printf 129 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 130 * The message should be completed (with at least a newline) with printf 131 * or addlog, respectively. There is no trailing space. 132 */ 133 #ifndef PRIdaddr 134 #define PRIdaddr PRId64 135 #endif 136 void 137 diskerr(const struct buf *bp, const char *dname, const char *what, int pri, 138 int blkdone, const struct disklabel *lp) 139 { 140 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 141 void (*pr)(const char *, ...); 142 char partname = 'a' + part; 143 daddr_t sn; 144 145 if (/*CONSTCOND*/0) 146 /* Compiler will error this is the format is wrong... */ 147 printf("%" PRIdaddr, bp->b_blkno); 148 149 if (pri != LOG_PRINTF) { 150 static const char fmt[] = ""; 151 log(pri, fmt); 152 pr = addlog; 153 } else 154 pr = printf; 155 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 156 bp->b_flags & B_READ ? "read" : "writ"); 157 sn = bp->b_blkno; 158 if (bp->b_bcount <= DEV_BSIZE) 159 (*pr)("%" PRIdaddr, sn); 160 else { 161 if (blkdone >= 0) { 162 sn += blkdone; 163 (*pr)("%" PRIdaddr " of ", sn); 164 } 165 (*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno, 166 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 167 } 168 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 169 sn += lp->d_partitions[part].p_offset; 170 (*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "", 171 dname, unit, sn, sn / lp->d_secpercyl); 172 sn %= lp->d_secpercyl; 173 (*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")", 174 sn / lp->d_nsectors, sn % lp->d_nsectors); 175 } 176 } 177 178 /* 179 * Initialize the disklist. Called by main() before autoconfiguration. 180 */ 181 void 182 disk_init(void) 183 { 184 185 TAILQ_INIT(&disklist); 186 disk_count = 0; 187 } 188 189 /* 190 * Searches the disklist for the disk corresponding to the 191 * name provided. 192 */ 193 struct disk * 194 disk_find(char *name) 195 { 196 struct disk *diskp; 197 198 if ((name == NULL) || (disk_count <= 0)) 199 return (NULL); 200 201 simple_lock(&disklist_slock); 202 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 203 diskp = TAILQ_NEXT(diskp, dk_link)) 204 if (strcmp(diskp->dk_name, name) == 0) { 205 simple_unlock(&disklist_slock); 206 return (diskp); 207 } 208 simple_unlock(&disklist_slock); 209 210 return (NULL); 211 } 212 213 /* 214 * Attach a disk. 215 */ 216 void 217 disk_attach(struct disk *diskp) 218 { 219 int s; 220 221 /* 222 * Allocate and initialize the disklabel structures. Note that 223 * it's not safe to sleep here, since we're probably going to be 224 * called during autoconfiguration. 225 */ 226 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 227 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 228 M_NOWAIT); 229 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 230 panic("disk_attach: can't allocate storage for disklabel"); 231 232 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 233 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 234 235 /* 236 * Set the attached timestamp. 237 */ 238 s = splclock(); 239 diskp->dk_attachtime = mono_time; 240 splx(s); 241 242 /* 243 * Link into the disklist. 244 */ 245 simple_lock(&disklist_slock); 246 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 247 simple_unlock(&disklist_slock); 248 ++disk_count; 249 } 250 251 /* 252 * Detach a disk. 253 */ 254 void 255 disk_detach(struct disk *diskp) 256 { 257 258 /* 259 * Remove from the disklist. 260 */ 261 if (--disk_count < 0) 262 panic("disk_detach: disk_count < 0"); 263 simple_lock(&disklist_slock); 264 TAILQ_REMOVE(&disklist, diskp, dk_link); 265 simple_unlock(&disklist_slock); 266 267 /* 268 * Free the space used by the disklabel structures. 269 */ 270 free(diskp->dk_label, M_DEVBUF); 271 free(diskp->dk_cpulabel, M_DEVBUF); 272 } 273 274 /* 275 * Increment a disk's busy counter. If the counter is going from 276 * 0 to 1, set the timestamp. 277 */ 278 void 279 disk_busy(struct disk *diskp) 280 { 281 int s; 282 283 /* 284 * XXX We'd like to use something as accurate as microtime(), 285 * but that doesn't depend on the system TOD clock. 286 */ 287 if (diskp->dk_busy++ == 0) { 288 s = splclock(); 289 diskp->dk_timestamp = mono_time; 290 splx(s); 291 } 292 } 293 294 /* 295 * Decrement a disk's busy counter, increment the byte count, total busy 296 * time, and reset the timestamp. 297 */ 298 void 299 disk_unbusy(struct disk *diskp, long bcount, int read) 300 { 301 int s; 302 struct timeval dv_time, diff_time; 303 304 if (diskp->dk_busy-- == 0) { 305 printf("%s: dk_busy < 0\n", diskp->dk_name); 306 panic("disk_unbusy"); 307 } 308 309 s = splclock(); 310 dv_time = mono_time; 311 splx(s); 312 313 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 314 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 315 316 diskp->dk_timestamp = dv_time; 317 if (bcount > 0) { 318 if (read) { 319 diskp->dk_rbytes += bcount; 320 diskp->dk_rxfer++; 321 } else { 322 diskp->dk_wbytes += bcount; 323 diskp->dk_wxfer++; 324 } 325 } 326 } 327 328 /* 329 * Reset the metrics counters on the given disk. Note that we cannot 330 * reset the busy counter, as it may case a panic in disk_unbusy(). 331 * We also must avoid playing with the timestamp information, as it 332 * may skew any pending transfer results. 333 */ 334 void 335 disk_resetstat(struct disk *diskp) 336 { 337 int s = splbio(), t; 338 339 diskp->dk_rxfer = 0; 340 diskp->dk_rbytes = 0; 341 diskp->dk_wxfer = 0; 342 diskp->dk_wbytes = 0; 343 344 t = splclock(); 345 diskp->dk_attachtime = mono_time; 346 splx(t); 347 348 timerclear(&diskp->dk_time); 349 350 splx(s); 351 } 352 353 int 354 sysctl_disknames(void *vwhere, size_t *sizep) 355 { 356 char buf[DK_DISKNAMELEN + 1]; 357 char *where = vwhere; 358 struct disk *diskp; 359 size_t needed, left, slen; 360 int error, first; 361 362 first = 1; 363 error = 0; 364 needed = 0; 365 left = *sizep; 366 367 simple_lock(&disklist_slock); 368 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 369 diskp = TAILQ_NEXT(diskp, dk_link)) { 370 if (where == NULL) 371 needed += strlen(diskp->dk_name) + 1; 372 else { 373 memset(buf, 0, sizeof(buf)); 374 if (first) { 375 strncpy(buf, diskp->dk_name, sizeof(buf)); 376 first = 0; 377 } else { 378 buf[0] = ' '; 379 strncpy(buf + 1, diskp->dk_name, 380 sizeof(buf) - 1); 381 } 382 buf[DK_DISKNAMELEN] = '\0'; 383 slen = strlen(buf); 384 if (left < slen + 1) 385 break; 386 /* +1 to copy out the trailing NUL byte */ 387 error = copyout(buf, where, slen + 1); 388 if (error) 389 break; 390 where += slen; 391 needed += slen; 392 left -= slen; 393 } 394 } 395 simple_unlock(&disklist_slock); 396 *sizep = needed; 397 return (error); 398 } 399 400 int 401 sysctl_diskstats(int *name, u_int namelen, void *vwhere, size_t *sizep) 402 { 403 struct disk_sysctl sdisk; 404 struct disk *diskp; 405 char *where = vwhere; 406 size_t tocopy, left; 407 int error; 408 409 /* 410 * The original hw.diskstats call was broken and did not require 411 * the userland to pass in it's size of struct disk_sysctl. This 412 * was fixed after NetBSD 1.6 was released, and any applications 413 * that do not pass in the size are given an error only, unless 414 * we care about 1.6 compatibility. 415 */ 416 if (namelen == 0) 417 #ifdef COMPAT_16 418 tocopy = offsetof(struct disk_sysctl, dk_rxfer); 419 #else 420 return (EINVAL); 421 #endif 422 else 423 tocopy = name[0]; 424 425 if (where == NULL) { 426 *sizep = disk_count * tocopy; 427 return (0); 428 } 429 430 error = 0; 431 left = *sizep; 432 memset(&sdisk, 0, sizeof(sdisk)); 433 *sizep = 0; 434 435 simple_lock(&disklist_slock); 436 TAILQ_FOREACH(diskp, &disklist, dk_link) { 437 if (left < tocopy) 438 break; 439 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name)); 440 sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer; 441 sdisk.dk_rxfer = diskp->dk_rxfer; 442 sdisk.dk_wxfer = diskp->dk_wxfer; 443 sdisk.dk_seek = diskp->dk_seek; 444 sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes; 445 sdisk.dk_rbytes = diskp->dk_rbytes; 446 sdisk.dk_wbytes = diskp->dk_wbytes; 447 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec; 448 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec; 449 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec; 450 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec; 451 sdisk.dk_time_sec = diskp->dk_time.tv_sec; 452 sdisk.dk_time_usec = diskp->dk_time.tv_usec; 453 sdisk.dk_busy = diskp->dk_busy; 454 455 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk))); 456 if (error) 457 break; 458 where += tocopy; 459 *sizep += tocopy; 460 left -= tocopy; 461 } 462 simple_unlock(&disklist_slock); 463 return (error); 464 } 465 466 struct bufq_fcfs { 467 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 468 }; 469 470 struct bufq_disksort { 471 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 472 }; 473 474 #define PRIO_READ_BURST 48 475 #define PRIO_WRITE_REQ 16 476 477 struct bufq_prio { 478 TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */ 479 struct buf *bq_write_next; /* next request in bq_write */ 480 struct buf *bq_next; /* current request */ 481 int bq_read_burst; /* # of consecutive reads */ 482 }; 483 484 485 /* 486 * Check if two buf's are in ascending order. 487 */ 488 static __inline int 489 buf_inorder(struct buf *bp, struct buf *bq, int sortby) 490 { 491 int r; 492 493 if (bp == NULL || bq == NULL) 494 return (bq == NULL); 495 496 if (sortby == BUFQ_SORT_CYLINDER) 497 r = bp->b_cylinder - bq->b_cylinder; 498 else 499 r = 0; 500 501 if (r == 0) 502 r = bp->b_rawblkno - bq->b_rawblkno; 503 504 return (r <= 0); 505 } 506 507 508 /* 509 * First-come first-served sort for disks. 510 * 511 * Requests are appended to the queue without any reordering. 512 */ 513 static void 514 bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp) 515 { 516 struct bufq_fcfs *fcfs = bufq->bq_private; 517 518 TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq); 519 } 520 521 static struct buf * 522 bufq_fcfs_get(struct bufq_state *bufq, int remove) 523 { 524 struct bufq_fcfs *fcfs = bufq->bq_private; 525 struct buf *bp; 526 527 bp = TAILQ_FIRST(&fcfs->bq_head); 528 529 if (bp != NULL && remove) 530 TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq); 531 532 return (bp); 533 } 534 535 536 /* 537 * Seek sort for disks. 538 * 539 * There are actually two queues, sorted in ascendening order. The first 540 * queue holds those requests which are positioned after the current block; 541 * the second holds requests which came in after their position was passed. 542 * Thus we implement a one-way scan, retracting after reaching the end of 543 * the drive to the first request on the second queue, at which time it 544 * becomes the first queue. 545 * 546 * A one-way scan is natural because of the way UNIX read-ahead blocks are 547 * allocated. 548 */ 549 static void 550 bufq_disksort_put(struct bufq_state *bufq, struct buf *bp) 551 { 552 struct bufq_disksort *disksort = bufq->bq_private; 553 struct buf *bq, *nbq; 554 int sortby; 555 556 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 557 558 bq = TAILQ_FIRST(&disksort->bq_head); 559 560 /* 561 * If the queue is empty it's easy; we just go on the end. 562 */ 563 if (bq == NULL) { 564 TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq); 565 return; 566 } 567 568 /* 569 * If we lie before the currently active request, then we 570 * must locate the second request list and add ourselves to it. 571 */ 572 if (buf_inorder(bp, bq, sortby)) { 573 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 574 /* 575 * Check for an ``inversion'' in the normally ascending 576 * block numbers, indicating the start of the second 577 * request list. 578 */ 579 if (buf_inorder(nbq, bq, sortby)) { 580 /* 581 * Search the second request list for the first 582 * request at a larger block number. We go 583 * after that; if there is no such request, we 584 * go at the end. 585 */ 586 do { 587 if (buf_inorder(bp, nbq, sortby)) 588 goto insert; 589 bq = nbq; 590 } while ((nbq = 591 TAILQ_NEXT(bq, b_actq)) != NULL); 592 goto insert; /* after last */ 593 } 594 bq = nbq; 595 } 596 /* 597 * No inversions... we will go after the last, and 598 * be the first request in the second request list. 599 */ 600 goto insert; 601 } 602 /* 603 * Request is at/after the current request... 604 * sort in the first request list. 605 */ 606 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 607 /* 608 * We want to go after the current request if there is an 609 * inversion after it (i.e. it is the end of the first 610 * request list), or if the next request is a larger cylinder 611 * than our request. 612 */ 613 if (buf_inorder(nbq, bq, sortby) || 614 buf_inorder(bp, nbq, sortby)) 615 goto insert; 616 bq = nbq; 617 } 618 /* 619 * Neither a second list nor a larger request... we go at the end of 620 * the first list, which is the same as the end of the whole schebang. 621 */ 622 insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq); 623 } 624 625 static struct buf * 626 bufq_disksort_get(struct bufq_state *bufq, int remove) 627 { 628 struct bufq_disksort *disksort = bufq->bq_private; 629 struct buf *bp; 630 631 bp = TAILQ_FIRST(&disksort->bq_head); 632 633 if (bp != NULL && remove) 634 TAILQ_REMOVE(&disksort->bq_head, bp, b_actq); 635 636 return (bp); 637 } 638 639 640 /* 641 * Seek sort for disks. 642 * 643 * There are two queues. The first queue holds read requests; the second 644 * holds write requests. The read queue is first-come first-served; the 645 * write queue is sorted in ascendening block order. 646 * The read queue is processed first. After PRIO_READ_BURST consecutive 647 * read requests with non-empty write queue PRIO_WRITE_REQ requests from 648 * the write queue will be processed. 649 */ 650 static void 651 bufq_prio_put(struct bufq_state *bufq, struct buf *bp) 652 { 653 struct bufq_prio *prio = bufq->bq_private; 654 struct buf *bq; 655 int sortby; 656 657 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 658 659 /* 660 * If it's a read request append it to the list. 661 */ 662 if ((bp->b_flags & B_READ) == B_READ) { 663 TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq); 664 return; 665 } 666 667 bq = TAILQ_FIRST(&prio->bq_write); 668 669 /* 670 * If the write list is empty, simply append it to the list. 671 */ 672 if (bq == NULL) { 673 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 674 prio->bq_write_next = bp; 675 return; 676 } 677 678 /* 679 * If we lie after the next request, insert after this request. 680 */ 681 if (buf_inorder(prio->bq_write_next, bp, sortby)) 682 bq = prio->bq_write_next; 683 684 /* 685 * Search for the first request at a larger block number. 686 * We go before this request if it exists. 687 */ 688 while (bq != NULL && buf_inorder(bq, bp, sortby)) 689 bq = TAILQ_NEXT(bq, b_actq); 690 691 if (bq != NULL) 692 TAILQ_INSERT_BEFORE(bq, bp, b_actq); 693 else 694 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 695 } 696 697 static struct buf * 698 bufq_prio_get(struct bufq_state *bufq, int remove) 699 { 700 struct bufq_prio *prio = bufq->bq_private; 701 struct buf *bp; 702 703 /* 704 * If no current request, get next from the lists. 705 */ 706 if (prio->bq_next == NULL) { 707 /* 708 * If at least one list is empty, select the other. 709 */ 710 if (TAILQ_FIRST(&prio->bq_read) == NULL) { 711 prio->bq_next = prio->bq_write_next; 712 prio->bq_read_burst = 0; 713 } else if (prio->bq_write_next == NULL) { 714 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 715 prio->bq_read_burst = 0; 716 } else { 717 /* 718 * Both list have requests. Select the read list up 719 * to PRIO_READ_BURST times, then select the write 720 * list PRIO_WRITE_REQ times. 721 */ 722 if (prio->bq_read_burst++ < PRIO_READ_BURST) 723 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 724 else if (prio->bq_read_burst < 725 PRIO_READ_BURST + PRIO_WRITE_REQ) 726 prio->bq_next = prio->bq_write_next; 727 else { 728 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 729 prio->bq_read_burst = 0; 730 } 731 } 732 } 733 734 bp = prio->bq_next; 735 736 if (bp != NULL && remove) { 737 if ((bp->b_flags & B_READ) == B_READ) 738 TAILQ_REMOVE(&prio->bq_read, bp, b_actq); 739 else { 740 /* 741 * Advance the write pointer before removing 742 * bp since it is actually prio->bq_write_next. 743 */ 744 prio->bq_write_next = 745 TAILQ_NEXT(prio->bq_write_next, b_actq); 746 TAILQ_REMOVE(&prio->bq_write, bp, b_actq); 747 if (prio->bq_write_next == NULL) 748 prio->bq_write_next = 749 TAILQ_FIRST(&prio->bq_write); 750 } 751 752 prio->bq_next = NULL; 753 } 754 755 return (bp); 756 } 757 758 /* 759 * Create a device buffer queue. 760 */ 761 void 762 bufq_alloc(struct bufq_state *bufq, int flags) 763 { 764 struct bufq_fcfs *fcfs; 765 struct bufq_disksort *disksort; 766 struct bufq_prio *prio; 767 768 bufq->bq_flags = flags; 769 770 switch (flags & BUFQ_SORT_MASK) { 771 case BUFQ_SORT_RAWBLOCK: 772 case BUFQ_SORT_CYLINDER: 773 break; 774 case 0: 775 if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS) 776 break; 777 /* FALLTHROUGH */ 778 default: 779 panic("bufq_alloc: sort out of range"); 780 } 781 782 switch (flags & BUFQ_METHOD_MASK) { 783 case BUFQ_FCFS: 784 bufq->bq_get = bufq_fcfs_get; 785 bufq->bq_put = bufq_fcfs_put; 786 MALLOC(bufq->bq_private, struct bufq_fcfs *, 787 sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO); 788 fcfs = (struct bufq_fcfs *)bufq->bq_private; 789 TAILQ_INIT(&fcfs->bq_head); 790 break; 791 case BUFQ_DISKSORT: 792 bufq->bq_get = bufq_disksort_get; 793 bufq->bq_put = bufq_disksort_put; 794 MALLOC(bufq->bq_private, struct bufq_disksort *, 795 sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO); 796 disksort = (struct bufq_disksort *)bufq->bq_private; 797 TAILQ_INIT(&disksort->bq_head); 798 break; 799 case BUFQ_READ_PRIO: 800 bufq->bq_get = bufq_prio_get; 801 bufq->bq_put = bufq_prio_put; 802 MALLOC(bufq->bq_private, struct bufq_prio *, 803 sizeof(struct bufq_prio), M_DEVBUF, M_ZERO); 804 prio = (struct bufq_prio *)bufq->bq_private; 805 TAILQ_INIT(&prio->bq_read); 806 TAILQ_INIT(&prio->bq_write); 807 break; 808 default: 809 panic("bufq_alloc: method out of range"); 810 } 811 } 812 813 /* 814 * Destroy a device buffer queue. 815 */ 816 void 817 bufq_free(struct bufq_state *bufq) 818 { 819 820 KASSERT(bufq->bq_private != NULL); 821 KASSERT(BUFQ_PEEK(bufq) == NULL); 822 823 FREE(bufq->bq_private, M_DEVBUF); 824 bufq->bq_get = NULL; 825 bufq->bq_put = NULL; 826 } 827 828 /* 829 * Bounds checking against the media size, used for the raw partition. 830 * The sector size passed in should currently always be DEV_BSIZE, 831 * and the media size the size of the device in DEV_BSIZE sectors. 832 */ 833 int 834 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize) 835 { 836 int sz; 837 838 sz = howmany(bp->b_bcount, secsize); 839 840 if (bp->b_blkno + sz > mediasize) { 841 sz = mediasize - bp->b_blkno; 842 if (sz == 0) { 843 /* If exactly at end of disk, return EOF. */ 844 bp->b_resid = bp->b_bcount; 845 goto done; 846 } 847 if (sz < 0) { 848 /* If past end of disk, return EINVAL. */ 849 bp->b_error = EINVAL; 850 goto bad; 851 } 852 /* Otherwise, truncate request. */ 853 bp->b_bcount = sz << DEV_BSHIFT; 854 } 855 856 return 1; 857 858 bad: 859 bp->b_flags |= B_ERROR; 860 done: 861 return 0; 862 } 863