1 /* $NetBSD: subr_disk.c,v 1.53 2003/08/07 16:31:52 agc Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1982, 1986, 1988, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. Neither the name of the University nor the names of its contributors 58 * may be used to endorse or promote products derived from this software 59 * without specific prior written permission. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 * 73 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 74 */ 75 76 #include <sys/cdefs.h> 77 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.53 2003/08/07 16:31:52 agc Exp $"); 78 79 #include "opt_compat_netbsd.h" 80 81 #include <sys/param.h> 82 #include <sys/kernel.h> 83 #include <sys/malloc.h> 84 #include <sys/buf.h> 85 #include <sys/syslog.h> 86 #include <sys/disklabel.h> 87 #include <sys/disk.h> 88 #include <sys/sysctl.h> 89 #include <lib/libkern/libkern.h> 90 91 /* 92 * A global list of all disks attached to the system. May grow or 93 * shrink over time. 94 */ 95 struct disklist_head disklist; /* TAILQ_HEAD */ 96 int disk_count; /* number of drives in global disklist */ 97 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 98 99 /* 100 * Compute checksum for disk label. 101 */ 102 u_int 103 dkcksum(struct disklabel *lp) 104 { 105 u_short *start, *end; 106 u_short sum = 0; 107 108 start = (u_short *)lp; 109 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 110 while (start < end) 111 sum ^= *start++; 112 return (sum); 113 } 114 115 /* 116 * Disk error is the preface to plaintive error messages 117 * about failing disk transfers. It prints messages of the form 118 119 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 120 121 * if the offset of the error in the transfer and a disk label 122 * are both available. blkdone should be -1 if the position of the error 123 * is unknown; the disklabel pointer may be null from drivers that have not 124 * been converted to use them. The message is printed with printf 125 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 126 * The message should be completed (with at least a newline) with printf 127 * or addlog, respectively. There is no trailing space. 128 */ 129 #ifndef PRIdaddr 130 #define PRIdaddr PRId64 131 #endif 132 void 133 diskerr(const struct buf *bp, const char *dname, const char *what, int pri, 134 int blkdone, const struct disklabel *lp) 135 { 136 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 137 void (*pr)(const char *, ...); 138 char partname = 'a' + part; 139 daddr_t sn; 140 141 if (/*CONSTCOND*/0) 142 /* Compiler will error this is the format is wrong... */ 143 printf("%" PRIdaddr, bp->b_blkno); 144 145 if (pri != LOG_PRINTF) { 146 static const char fmt[] = ""; 147 log(pri, fmt); 148 pr = addlog; 149 } else 150 pr = printf; 151 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 152 bp->b_flags & B_READ ? "read" : "writ"); 153 sn = bp->b_blkno; 154 if (bp->b_bcount <= DEV_BSIZE) 155 (*pr)("%" PRIdaddr, sn); 156 else { 157 if (blkdone >= 0) { 158 sn += blkdone; 159 (*pr)("%" PRIdaddr " of ", sn); 160 } 161 (*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno, 162 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 163 } 164 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 165 sn += lp->d_partitions[part].p_offset; 166 (*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "", 167 dname, unit, sn, sn / lp->d_secpercyl); 168 sn %= lp->d_secpercyl; 169 (*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")", 170 sn / lp->d_nsectors, sn % lp->d_nsectors); 171 } 172 } 173 174 /* 175 * Initialize the disklist. Called by main() before autoconfiguration. 176 */ 177 void 178 disk_init(void) 179 { 180 181 TAILQ_INIT(&disklist); 182 disk_count = 0; 183 } 184 185 /* 186 * Searches the disklist for the disk corresponding to the 187 * name provided. 188 */ 189 struct disk * 190 disk_find(char *name) 191 { 192 struct disk *diskp; 193 194 if ((name == NULL) || (disk_count <= 0)) 195 return (NULL); 196 197 simple_lock(&disklist_slock); 198 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 199 diskp = TAILQ_NEXT(diskp, dk_link)) 200 if (strcmp(diskp->dk_name, name) == 0) { 201 simple_unlock(&disklist_slock); 202 return (diskp); 203 } 204 simple_unlock(&disklist_slock); 205 206 return (NULL); 207 } 208 209 /* 210 * Attach a disk. 211 */ 212 void 213 disk_attach(struct disk *diskp) 214 { 215 int s; 216 217 /* 218 * Allocate and initialize the disklabel structures. Note that 219 * it's not safe to sleep here, since we're probably going to be 220 * called during autoconfiguration. 221 */ 222 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 223 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 224 M_NOWAIT); 225 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 226 panic("disk_attach: can't allocate storage for disklabel"); 227 228 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 229 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 230 231 /* 232 * Set the attached timestamp. 233 */ 234 s = splclock(); 235 diskp->dk_attachtime = mono_time; 236 splx(s); 237 238 /* 239 * Link into the disklist. 240 */ 241 simple_lock(&disklist_slock); 242 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 243 simple_unlock(&disklist_slock); 244 ++disk_count; 245 } 246 247 /* 248 * Detach a disk. 249 */ 250 void 251 disk_detach(struct disk *diskp) 252 { 253 254 /* 255 * Remove from the disklist. 256 */ 257 if (--disk_count < 0) 258 panic("disk_detach: disk_count < 0"); 259 simple_lock(&disklist_slock); 260 TAILQ_REMOVE(&disklist, diskp, dk_link); 261 simple_unlock(&disklist_slock); 262 263 /* 264 * Free the space used by the disklabel structures. 265 */ 266 free(diskp->dk_label, M_DEVBUF); 267 free(diskp->dk_cpulabel, M_DEVBUF); 268 } 269 270 /* 271 * Increment a disk's busy counter. If the counter is going from 272 * 0 to 1, set the timestamp. 273 */ 274 void 275 disk_busy(struct disk *diskp) 276 { 277 int s; 278 279 /* 280 * XXX We'd like to use something as accurate as microtime(), 281 * but that doesn't depend on the system TOD clock. 282 */ 283 if (diskp->dk_busy++ == 0) { 284 s = splclock(); 285 diskp->dk_timestamp = mono_time; 286 splx(s); 287 } 288 } 289 290 /* 291 * Decrement a disk's busy counter, increment the byte count, total busy 292 * time, and reset the timestamp. 293 */ 294 void 295 disk_unbusy(struct disk *diskp, long bcount, int read) 296 { 297 int s; 298 struct timeval dv_time, diff_time; 299 300 if (diskp->dk_busy-- == 0) { 301 printf("%s: dk_busy < 0\n", diskp->dk_name); 302 panic("disk_unbusy"); 303 } 304 305 s = splclock(); 306 dv_time = mono_time; 307 splx(s); 308 309 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 310 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 311 312 diskp->dk_timestamp = dv_time; 313 if (bcount > 0) { 314 if (read) { 315 diskp->dk_rbytes += bcount; 316 diskp->dk_rxfer++; 317 } else { 318 diskp->dk_wbytes += bcount; 319 diskp->dk_wxfer++; 320 } 321 } 322 } 323 324 /* 325 * Reset the metrics counters on the given disk. Note that we cannot 326 * reset the busy counter, as it may case a panic in disk_unbusy(). 327 * We also must avoid playing with the timestamp information, as it 328 * may skew any pending transfer results. 329 */ 330 void 331 disk_resetstat(struct disk *diskp) 332 { 333 int s = splbio(), t; 334 335 diskp->dk_rxfer = 0; 336 diskp->dk_rbytes = 0; 337 diskp->dk_wxfer = 0; 338 diskp->dk_wbytes = 0; 339 340 t = splclock(); 341 diskp->dk_attachtime = mono_time; 342 splx(t); 343 344 timerclear(&diskp->dk_time); 345 346 splx(s); 347 } 348 349 int 350 sysctl_disknames(void *vwhere, size_t *sizep) 351 { 352 char buf[DK_DISKNAMELEN + 1]; 353 char *where = vwhere; 354 struct disk *diskp; 355 size_t needed, left, slen; 356 int error, first; 357 358 first = 1; 359 error = 0; 360 needed = 0; 361 left = *sizep; 362 363 simple_lock(&disklist_slock); 364 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 365 diskp = TAILQ_NEXT(diskp, dk_link)) { 366 if (where == NULL) 367 needed += strlen(diskp->dk_name) + 1; 368 else { 369 memset(buf, 0, sizeof(buf)); 370 if (first) { 371 strncpy(buf, diskp->dk_name, sizeof(buf)); 372 first = 0; 373 } else { 374 buf[0] = ' '; 375 strncpy(buf + 1, diskp->dk_name, 376 sizeof(buf) - 1); 377 } 378 buf[DK_DISKNAMELEN] = '\0'; 379 slen = strlen(buf); 380 if (left < slen + 1) 381 break; 382 /* +1 to copy out the trailing NUL byte */ 383 error = copyout(buf, where, slen + 1); 384 if (error) 385 break; 386 where += slen; 387 needed += slen; 388 left -= slen; 389 } 390 } 391 simple_unlock(&disklist_slock); 392 *sizep = needed; 393 return (error); 394 } 395 396 int 397 sysctl_diskstats(int *name, u_int namelen, void *vwhere, size_t *sizep) 398 { 399 struct disk_sysctl sdisk; 400 struct disk *diskp; 401 char *where = vwhere; 402 size_t tocopy, left; 403 int error; 404 405 /* 406 * The original hw.diskstats call was broken and did not require 407 * the userland to pass in it's size of struct disk_sysctl. This 408 * was fixed after NetBSD 1.6 was released, and any applications 409 * that do not pass in the size are given an error only, unless 410 * we care about 1.6 compatibility. 411 */ 412 if (namelen == 0) 413 #ifdef COMPAT_16 414 tocopy = offsetof(struct disk_sysctl, dk_rxfer); 415 #else 416 return (EINVAL); 417 #endif 418 else 419 tocopy = name[0]; 420 421 if (where == NULL) { 422 *sizep = disk_count * tocopy; 423 return (0); 424 } 425 426 error = 0; 427 left = *sizep; 428 memset(&sdisk, 0, sizeof(sdisk)); 429 *sizep = 0; 430 431 simple_lock(&disklist_slock); 432 TAILQ_FOREACH(diskp, &disklist, dk_link) { 433 if (left < tocopy) 434 break; 435 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name)); 436 sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer; 437 sdisk.dk_rxfer = diskp->dk_rxfer; 438 sdisk.dk_wxfer = diskp->dk_wxfer; 439 sdisk.dk_seek = diskp->dk_seek; 440 sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes; 441 sdisk.dk_rbytes = diskp->dk_rbytes; 442 sdisk.dk_wbytes = diskp->dk_wbytes; 443 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec; 444 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec; 445 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec; 446 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec; 447 sdisk.dk_time_sec = diskp->dk_time.tv_sec; 448 sdisk.dk_time_usec = diskp->dk_time.tv_usec; 449 sdisk.dk_busy = diskp->dk_busy; 450 451 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk))); 452 if (error) 453 break; 454 where += tocopy; 455 *sizep += tocopy; 456 left -= tocopy; 457 } 458 simple_unlock(&disklist_slock); 459 return (error); 460 } 461 462 struct bufq_fcfs { 463 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 464 }; 465 466 struct bufq_disksort { 467 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 468 }; 469 470 #define PRIO_READ_BURST 48 471 #define PRIO_WRITE_REQ 16 472 473 struct bufq_prio { 474 TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */ 475 struct buf *bq_write_next; /* next request in bq_write */ 476 struct buf *bq_next; /* current request */ 477 int bq_read_burst; /* # of consecutive reads */ 478 }; 479 480 481 /* 482 * Check if two buf's are in ascending order. 483 */ 484 static __inline int 485 buf_inorder(struct buf *bp, struct buf *bq, int sortby) 486 { 487 int r; 488 489 if (bp == NULL || bq == NULL) 490 return (bq == NULL); 491 492 if (sortby == BUFQ_SORT_CYLINDER) 493 r = bp->b_cylinder - bq->b_cylinder; 494 else 495 r = 0; 496 497 if (r == 0) 498 r = bp->b_rawblkno - bq->b_rawblkno; 499 500 return (r <= 0); 501 } 502 503 504 /* 505 * First-come first-served sort for disks. 506 * 507 * Requests are appended to the queue without any reordering. 508 */ 509 static void 510 bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp) 511 { 512 struct bufq_fcfs *fcfs = bufq->bq_private; 513 514 TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq); 515 } 516 517 static struct buf * 518 bufq_fcfs_get(struct bufq_state *bufq, int remove) 519 { 520 struct bufq_fcfs *fcfs = bufq->bq_private; 521 struct buf *bp; 522 523 bp = TAILQ_FIRST(&fcfs->bq_head); 524 525 if (bp != NULL && remove) 526 TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq); 527 528 return (bp); 529 } 530 531 532 /* 533 * Seek sort for disks. 534 * 535 * There are actually two queues, sorted in ascendening order. The first 536 * queue holds those requests which are positioned after the current block; 537 * the second holds requests which came in after their position was passed. 538 * Thus we implement a one-way scan, retracting after reaching the end of 539 * the drive to the first request on the second queue, at which time it 540 * becomes the first queue. 541 * 542 * A one-way scan is natural because of the way UNIX read-ahead blocks are 543 * allocated. 544 */ 545 static void 546 bufq_disksort_put(struct bufq_state *bufq, struct buf *bp) 547 { 548 struct bufq_disksort *disksort = bufq->bq_private; 549 struct buf *bq, *nbq; 550 int sortby; 551 552 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 553 554 bq = TAILQ_FIRST(&disksort->bq_head); 555 556 /* 557 * If the queue is empty it's easy; we just go on the end. 558 */ 559 if (bq == NULL) { 560 TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq); 561 return; 562 } 563 564 /* 565 * If we lie before the currently active request, then we 566 * must locate the second request list and add ourselves to it. 567 */ 568 if (buf_inorder(bp, bq, sortby)) { 569 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 570 /* 571 * Check for an ``inversion'' in the normally ascending 572 * block numbers, indicating the start of the second 573 * request list. 574 */ 575 if (buf_inorder(nbq, bq, sortby)) { 576 /* 577 * Search the second request list for the first 578 * request at a larger block number. We go 579 * after that; if there is no such request, we 580 * go at the end. 581 */ 582 do { 583 if (buf_inorder(bp, nbq, sortby)) 584 goto insert; 585 bq = nbq; 586 } while ((nbq = 587 TAILQ_NEXT(bq, b_actq)) != NULL); 588 goto insert; /* after last */ 589 } 590 bq = nbq; 591 } 592 /* 593 * No inversions... we will go after the last, and 594 * be the first request in the second request list. 595 */ 596 goto insert; 597 } 598 /* 599 * Request is at/after the current request... 600 * sort in the first request list. 601 */ 602 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 603 /* 604 * We want to go after the current request if there is an 605 * inversion after it (i.e. it is the end of the first 606 * request list), or if the next request is a larger cylinder 607 * than our request. 608 */ 609 if (buf_inorder(nbq, bq, sortby) || 610 buf_inorder(bp, nbq, sortby)) 611 goto insert; 612 bq = nbq; 613 } 614 /* 615 * Neither a second list nor a larger request... we go at the end of 616 * the first list, which is the same as the end of the whole schebang. 617 */ 618 insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq); 619 } 620 621 static struct buf * 622 bufq_disksort_get(struct bufq_state *bufq, int remove) 623 { 624 struct bufq_disksort *disksort = bufq->bq_private; 625 struct buf *bp; 626 627 bp = TAILQ_FIRST(&disksort->bq_head); 628 629 if (bp != NULL && remove) 630 TAILQ_REMOVE(&disksort->bq_head, bp, b_actq); 631 632 return (bp); 633 } 634 635 636 /* 637 * Seek sort for disks. 638 * 639 * There are two queues. The first queue holds read requests; the second 640 * holds write requests. The read queue is first-come first-served; the 641 * write queue is sorted in ascendening block order. 642 * The read queue is processed first. After PRIO_READ_BURST consecutive 643 * read requests with non-empty write queue PRIO_WRITE_REQ requests from 644 * the write queue will be processed. 645 */ 646 static void 647 bufq_prio_put(struct bufq_state *bufq, struct buf *bp) 648 { 649 struct bufq_prio *prio = bufq->bq_private; 650 struct buf *bq; 651 int sortby; 652 653 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 654 655 /* 656 * If it's a read request append it to the list. 657 */ 658 if ((bp->b_flags & B_READ) == B_READ) { 659 TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq); 660 return; 661 } 662 663 bq = TAILQ_FIRST(&prio->bq_write); 664 665 /* 666 * If the write list is empty, simply append it to the list. 667 */ 668 if (bq == NULL) { 669 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 670 prio->bq_write_next = bp; 671 return; 672 } 673 674 /* 675 * If we lie after the next request, insert after this request. 676 */ 677 if (buf_inorder(prio->bq_write_next, bp, sortby)) 678 bq = prio->bq_write_next; 679 680 /* 681 * Search for the first request at a larger block number. 682 * We go before this request if it exists. 683 */ 684 while (bq != NULL && buf_inorder(bq, bp, sortby)) 685 bq = TAILQ_NEXT(bq, b_actq); 686 687 if (bq != NULL) 688 TAILQ_INSERT_BEFORE(bq, bp, b_actq); 689 else 690 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 691 } 692 693 static struct buf * 694 bufq_prio_get(struct bufq_state *bufq, int remove) 695 { 696 struct bufq_prio *prio = bufq->bq_private; 697 struct buf *bp; 698 699 /* 700 * If no current request, get next from the lists. 701 */ 702 if (prio->bq_next == NULL) { 703 /* 704 * If at least one list is empty, select the other. 705 */ 706 if (TAILQ_FIRST(&prio->bq_read) == NULL) { 707 prio->bq_next = prio->bq_write_next; 708 prio->bq_read_burst = 0; 709 } else if (prio->bq_write_next == NULL) { 710 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 711 prio->bq_read_burst = 0; 712 } else { 713 /* 714 * Both list have requests. Select the read list up 715 * to PRIO_READ_BURST times, then select the write 716 * list PRIO_WRITE_REQ times. 717 */ 718 if (prio->bq_read_burst++ < PRIO_READ_BURST) 719 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 720 else if (prio->bq_read_burst < 721 PRIO_READ_BURST + PRIO_WRITE_REQ) 722 prio->bq_next = prio->bq_write_next; 723 else { 724 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 725 prio->bq_read_burst = 0; 726 } 727 } 728 } 729 730 bp = prio->bq_next; 731 732 if (bp != NULL && remove) { 733 if ((bp->b_flags & B_READ) == B_READ) 734 TAILQ_REMOVE(&prio->bq_read, bp, b_actq); 735 else { 736 /* 737 * Advance the write pointer before removing 738 * bp since it is actually prio->bq_write_next. 739 */ 740 prio->bq_write_next = 741 TAILQ_NEXT(prio->bq_write_next, b_actq); 742 TAILQ_REMOVE(&prio->bq_write, bp, b_actq); 743 if (prio->bq_write_next == NULL) 744 prio->bq_write_next = 745 TAILQ_FIRST(&prio->bq_write); 746 } 747 748 prio->bq_next = NULL; 749 } 750 751 return (bp); 752 } 753 754 /* 755 * Create a device buffer queue. 756 */ 757 void 758 bufq_alloc(struct bufq_state *bufq, int flags) 759 { 760 struct bufq_fcfs *fcfs; 761 struct bufq_disksort *disksort; 762 struct bufq_prio *prio; 763 764 bufq->bq_flags = flags; 765 766 switch (flags & BUFQ_SORT_MASK) { 767 case BUFQ_SORT_RAWBLOCK: 768 case BUFQ_SORT_CYLINDER: 769 break; 770 case 0: 771 if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS) 772 break; 773 /* FALLTHROUGH */ 774 default: 775 panic("bufq_alloc: sort out of range"); 776 } 777 778 switch (flags & BUFQ_METHOD_MASK) { 779 case BUFQ_FCFS: 780 bufq->bq_get = bufq_fcfs_get; 781 bufq->bq_put = bufq_fcfs_put; 782 MALLOC(bufq->bq_private, struct bufq_fcfs *, 783 sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO); 784 fcfs = (struct bufq_fcfs *)bufq->bq_private; 785 TAILQ_INIT(&fcfs->bq_head); 786 break; 787 case BUFQ_DISKSORT: 788 bufq->bq_get = bufq_disksort_get; 789 bufq->bq_put = bufq_disksort_put; 790 MALLOC(bufq->bq_private, struct bufq_disksort *, 791 sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO); 792 disksort = (struct bufq_disksort *)bufq->bq_private; 793 TAILQ_INIT(&disksort->bq_head); 794 break; 795 case BUFQ_READ_PRIO: 796 bufq->bq_get = bufq_prio_get; 797 bufq->bq_put = bufq_prio_put; 798 MALLOC(bufq->bq_private, struct bufq_prio *, 799 sizeof(struct bufq_prio), M_DEVBUF, M_ZERO); 800 prio = (struct bufq_prio *)bufq->bq_private; 801 TAILQ_INIT(&prio->bq_read); 802 TAILQ_INIT(&prio->bq_write); 803 break; 804 default: 805 panic("bufq_alloc: method out of range"); 806 } 807 } 808 809 /* 810 * Destroy a device buffer queue. 811 */ 812 void 813 bufq_free(struct bufq_state *bufq) 814 { 815 816 KASSERT(bufq->bq_private != NULL); 817 KASSERT(BUFQ_PEEK(bufq) == NULL); 818 819 FREE(bufq->bq_private, M_DEVBUF); 820 bufq->bq_get = NULL; 821 bufq->bq_put = NULL; 822 } 823 824 /* 825 * Bounds checking against the media size, used for the raw partition. 826 * The sector size passed in should currently always be DEV_BSIZE, 827 * and the media size the size of the device in DEV_BSIZE sectors. 828 */ 829 int 830 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize) 831 { 832 int sz; 833 834 sz = howmany(bp->b_bcount, secsize); 835 836 if (bp->b_blkno + sz > mediasize) { 837 sz = mediasize - bp->b_blkno; 838 if (sz == 0) { 839 /* If exactly at end of disk, return EOF. */ 840 bp->b_resid = bp->b_bcount; 841 goto done; 842 } 843 if (sz < 0) { 844 /* If past end of disk, return EINVAL. */ 845 bp->b_error = EINVAL; 846 goto bad; 847 } 848 /* Otherwise, truncate request. */ 849 bp->b_bcount = sz << DEV_BSHIFT; 850 } 851 852 return 1; 853 854 bad: 855 bp->b_flags |= B_ERROR; 856 done: 857 return 0; 858 } 859