1 /* $NetBSD: subr_disk.c,v 1.58 2004/01/10 14:49:44 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1982, 1986, 1988, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. Neither the name of the University nor the names of its contributors 58 * may be used to endorse or promote products derived from this software 59 * without specific prior written permission. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 * 73 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 74 */ 75 76 #include <sys/cdefs.h> 77 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.58 2004/01/10 14:49:44 yamt Exp $"); 78 79 #include "opt_compat_netbsd.h" 80 81 #include <sys/param.h> 82 #include <sys/kernel.h> 83 #include <sys/malloc.h> 84 #include <sys/buf.h> 85 #include <sys/syslog.h> 86 #include <sys/disklabel.h> 87 #include <sys/disk.h> 88 #include <sys/sysctl.h> 89 #include <lib/libkern/libkern.h> 90 91 /* 92 * A global list of all disks attached to the system. May grow or 93 * shrink over time. 94 */ 95 struct disklist_head disklist; /* TAILQ_HEAD */ 96 int disk_count; /* number of drives in global disklist */ 97 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 98 99 /* 100 * Compute checksum for disk label. 101 */ 102 u_int 103 dkcksum(struct disklabel *lp) 104 { 105 u_short *start, *end; 106 u_short sum = 0; 107 108 start = (u_short *)lp; 109 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 110 while (start < end) 111 sum ^= *start++; 112 return (sum); 113 } 114 115 /* 116 * Disk error is the preface to plaintive error messages 117 * about failing disk transfers. It prints messages of the form 118 119 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 120 121 * if the offset of the error in the transfer and a disk label 122 * are both available. 
 * blkdone should be -1 if the position of the error
 * is unknown; the disklabel pointer may be null from drivers that have not
 * been converted to use them.  The message is printed with printf
 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
 * The message should be completed (with at least a newline) with printf
 * or addlog, respectively.  There is no trailing space.
 */
#ifndef PRIdaddr
#define PRIdaddr PRId64	/* printf(3) conversion for daddr_t */
#endif
void
diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
    int blkdone, const struct disklabel *lp)
{
	int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
	void (*pr)(const char *, ...);
	char partname = 'a' + part;	/* partition letter from the minor */
	daddr_t sn;

	if (/*CONSTCOND*/0)
		/* Compiler will error if the format is wrong... */
		printf("%" PRIdaddr, bp->b_blkno);

	/*
	 * Select the output sink: the raw console via printf, or the
	 * kernel message log (followed up with addlog) at priority pri.
	 */
	if (pri != LOG_PRINTF) {
		static const char fmt[] = "";
		log(pri, fmt);
		pr = addlog;
	} else
		pr = printf;
	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
	    bp->b_flags & B_READ ? "read" : "writ");
	sn = bp->b_blkno;
	if (bp->b_bcount <= DEV_BSIZE)
		/* Single-sector transfer: just the one block number. */
		(*pr)("%" PRIdaddr, sn);
	else {
		if (blkdone >= 0) {
			/* The failing block within the transfer is known. */
			sn += blkdone;
			(*pr)("%" PRIdaddr " of ", sn);
		}
		/* Print the whole block range covered by the transfer. */
		(*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
	}
	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
		/*
		 * With a label available we can also express the error
		 * position as an absolute block number plus cylinder/
		 * track/sector coordinates on the raw device.
		 */
		sn += lp->d_partitions[part].p_offset;
		(*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
		    dname, unit, sn, sn / lp->d_secpercyl);
		sn %= lp->d_secpercyl;
		(*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
		    sn / lp->d_nsectors, sn % lp->d_nsectors);
	}
}

/*
 * Initialize the disklist.  Called by main() before autoconfiguration.
176 */ 177 void 178 disk_init(void) 179 { 180 181 TAILQ_INIT(&disklist); 182 disk_count = 0; 183 } 184 185 /* 186 * Searches the disklist for the disk corresponding to the 187 * name provided. 188 */ 189 struct disk * 190 disk_find(char *name) 191 { 192 struct disk *diskp; 193 194 if ((name == NULL) || (disk_count <= 0)) 195 return (NULL); 196 197 simple_lock(&disklist_slock); 198 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 199 diskp = TAILQ_NEXT(diskp, dk_link)) 200 if (strcmp(diskp->dk_name, name) == 0) { 201 simple_unlock(&disklist_slock); 202 return (diskp); 203 } 204 simple_unlock(&disklist_slock); 205 206 return (NULL); 207 } 208 209 /* 210 * Attach a disk. 211 */ 212 void 213 disk_attach(struct disk *diskp) 214 { 215 int s; 216 217 /* 218 * Allocate and initialize the disklabel structures. Note that 219 * it's not safe to sleep here, since we're probably going to be 220 * called during autoconfiguration. 221 */ 222 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 223 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 224 M_NOWAIT); 225 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 226 panic("disk_attach: can't allocate storage for disklabel"); 227 228 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 229 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 230 231 /* 232 * Set the attached timestamp. 233 */ 234 s = splclock(); 235 diskp->dk_attachtime = mono_time; 236 splx(s); 237 238 /* 239 * Link into the disklist. 240 */ 241 simple_lock(&disklist_slock); 242 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 243 simple_unlock(&disklist_slock); 244 ++disk_count; 245 } 246 247 /* 248 * Detach a disk. 249 */ 250 void 251 disk_detach(struct disk *diskp) 252 { 253 254 /* 255 * Remove from the disklist. 
256 */ 257 if (--disk_count < 0) 258 panic("disk_detach: disk_count < 0"); 259 simple_lock(&disklist_slock); 260 TAILQ_REMOVE(&disklist, diskp, dk_link); 261 simple_unlock(&disklist_slock); 262 263 /* 264 * Free the space used by the disklabel structures. 265 */ 266 free(diskp->dk_label, M_DEVBUF); 267 free(diskp->dk_cpulabel, M_DEVBUF); 268 } 269 270 /* 271 * Increment a disk's busy counter. If the counter is going from 272 * 0 to 1, set the timestamp. 273 */ 274 void 275 disk_busy(struct disk *diskp) 276 { 277 int s; 278 279 /* 280 * XXX We'd like to use something as accurate as microtime(), 281 * but that doesn't depend on the system TOD clock. 282 */ 283 if (diskp->dk_busy++ == 0) { 284 s = splclock(); 285 diskp->dk_timestamp = mono_time; 286 splx(s); 287 } 288 } 289 290 /* 291 * Decrement a disk's busy counter, increment the byte count, total busy 292 * time, and reset the timestamp. 293 */ 294 void 295 disk_unbusy(struct disk *diskp, long bcount, int read) 296 { 297 int s; 298 struct timeval dv_time, diff_time; 299 300 if (diskp->dk_busy-- == 0) { 301 printf("%s: dk_busy < 0\n", diskp->dk_name); 302 panic("disk_unbusy"); 303 } 304 305 s = splclock(); 306 dv_time = mono_time; 307 splx(s); 308 309 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 310 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 311 312 diskp->dk_timestamp = dv_time; 313 if (bcount > 0) { 314 if (read) { 315 diskp->dk_rbytes += bcount; 316 diskp->dk_rxfer++; 317 } else { 318 diskp->dk_wbytes += bcount; 319 diskp->dk_wxfer++; 320 } 321 } 322 } 323 324 /* 325 * Reset the metrics counters on the given disk. Note that we cannot 326 * reset the busy counter, as it may case a panic in disk_unbusy(). 327 * We also must avoid playing with the timestamp information, as it 328 * may skew any pending transfer results. 
 */
void
disk_resetstat(struct disk *diskp)
{
	int s = splbio(), t;

	diskp->dk_rxfer = 0;
	diskp->dk_rbytes = 0;
	diskp->dk_wxfer = 0;
	diskp->dk_wbytes = 0;

	/* Restart the attach-time base used for rate calculations. */
	t = splclock();
	diskp->dk_attachtime = mono_time;
	splx(t);

	timerclear(&diskp->dk_time);

	splx(s);
}

/*
 * sysctl helper: copy out a single space-separated string holding the
 * names of all attached disks.  A NULL oldp performs a size-only probe.
 */
int
sysctl_hw_disknames(SYSCTLFN_ARGS)
{
	char buf[DK_DISKNAMELEN + 1];
	char *where = oldp;
	struct disk *diskp;
	size_t needed, left, slen;
	int error, first;

	/* Read-only node; no name components accepted. */
	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	simple_lock(&disklist_slock);
	for (diskp = TAILQ_FIRST(&disklist); diskp != NULL;
	    diskp = TAILQ_NEXT(diskp, dk_link)) {
		if (where == NULL)
			needed += strlen(diskp->dk_name) + 1;
		else {
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, diskp->dk_name, sizeof(buf));
				first = 0;
			} else {
				/* Subsequent names get a space separator. */
				buf[0] = ' ';
				strncpy(buf + 1, diskp->dk_name,
				    sizeof(buf) - 1);
			}
			buf[DK_DISKNAMELEN] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			/*
			 * Advance past the name only; the next copyout
			 * overwrites this NUL so the result stays a
			 * single string.
			 */
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	simple_unlock(&disklist_slock);
	*oldlenp = needed;
	return (error);
}

/*
 * sysctl helper: copy out per-disk statistics, one struct disk_sysctl
 * (or caller-specified prefix thereof) per attached disk.
 */
int
sysctl_hw_diskstats(SYSCTLFN_ARGS)
{
	struct disk_sysctl sdisk;
	struct disk *diskp;
	char *where = oldp;
	size_t tocopy, left;
	int error;

	if (newp != NULL)
		return (EPERM);

	/*
	 * The original hw.diskstats call was broken and did not require
	 * the userland to pass in its size of struct disk_sysctl.  This
	 * was fixed after NetBSD 1.6 was released, and any applications
	 * that do not pass in the size are given an error only, unless
	 * we care about 1.6 compatibility.
	 */
	if (namelen == 0)
#ifdef COMPAT_16
		tocopy = offsetof(struct disk_sysctl, dk_rxfer);
#else
		return (EINVAL);
#endif
	else
		tocopy = name[0];

	if (where == NULL) {
		/* Size-only probe. */
		*oldlenp = disk_count * tocopy;
		return (0);
	}

	error = 0;
	left = *oldlenp;
	memset(&sdisk, 0, sizeof(sdisk));
	*oldlenp = 0;

	simple_lock(&disklist_slock);
	TAILQ_FOREACH(diskp, &disklist, dk_link) {
		if (left < tocopy)
			break;
		strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name));
		sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer;
		sdisk.dk_rxfer = diskp->dk_rxfer;
		sdisk.dk_wxfer = diskp->dk_wxfer;
		sdisk.dk_seek = diskp->dk_seek;
		sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes;
		sdisk.dk_rbytes = diskp->dk_rbytes;
		sdisk.dk_wbytes = diskp->dk_wbytes;
		sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec;
		sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec;
		sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec;
		sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec;
		sdisk.dk_time_sec = diskp->dk_time.tv_sec;
		sdisk.dk_time_usec = diskp->dk_time.tv_usec;
		sdisk.dk_busy = diskp->dk_busy;

		/* Never copy out more than the caller asked per disk. */
		error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk)));
		if (error)
			break;
		where += tocopy;
		*oldlenp += tocopy;
		left -= tocopy;
	}
	simple_unlock(&disklist_slock);
	return (error);
}

/* Private state for the first-come first-served strategy. */
struct bufq_fcfs {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

/* Private state for the one-way elevator (disksort) strategy. */
struct bufq_disksort {
	TAILQ_HEAD(, buf) bq_head;	/* actual list of buffers */
};

#define PRIO_READ_BURST		48	/* reads served before a write burst */
#define PRIO_WRITE_REQ		16	/* writes served per burst */

/* Private state for the read-priority strategy. */
struct bufq_prio {
	TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */
	struct buf *bq_write_next;	/* next request in bq_write */
	struct buf *bq_next;		/* current request */
	int bq_read_burst;		/* # of consecutive reads */
};


/*
 * Check if two buf's are in ascending order.
 * If either pointer is NULL the result is true only when bq is NULL,
 * i.e. NULL sorts after every real request.
 */
static __inline int
buf_inorder(struct buf *bp, struct buf *bq, int sortby)
{

	if (bp == NULL || bq == NULL)
		return (bq == NULL);

	if (sortby == BUFQ_SORT_CYLINDER) {
		/* Sort by cylinder first, raw block number second. */
		if (bp->b_cylinder != bq->b_cylinder)
			return bp->b_cylinder < bq->b_cylinder;
		else
			return bp->b_rawblkno < bq->b_rawblkno;
	} else
		return bp->b_rawblkno < bq->b_rawblkno;
}


/*
 * First-come first-served sort for disks.
 *
 * Requests are appended to the queue without any reordering.
 */
static void
bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_fcfs *fcfs = bufq->bq_private;

	TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
}

/* Return (and, if remove is set, dequeue) the head request. */
static struct buf *
bufq_fcfs_get(struct bufq_state *bufq, int remove)
{
	struct bufq_fcfs *fcfs = bufq->bq_private;
	struct buf *bp;

	bp = TAILQ_FIRST(&fcfs->bq_head);

	if (bp != NULL && remove)
		TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);

	return (bp);
}


/*
 * Seek sort for disks.
 *
 * There are actually two queues, sorted in ascending order.  The first
 * queue holds those requests which are positioned after the current block;
 * the second holds requests which came in after their position was passed.
 * Thus we implement a one-way scan, retracting after reaching the end of
 * the drive to the first request on the second queue, at which time it
 * becomes the first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead blocks are
 * allocated.
 */
static void
bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_disksort *disksort = bufq->bq_private;
	struct buf *bq, *nbq;
	int sortby;

	sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	bq = TAILQ_FIRST(&disksort->bq_head);

	/*
	 * If the queue is empty it's easy; we just go on the end.
	 */
	if (bq == NULL) {
		TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
		return;
	}

	/*
	 * If we lie before the currently active request, then we
	 * must locate the second request list and add ourselves to it.
	 */
	if (buf_inorder(bp, bq, sortby)) {
		while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
			/*
			 * Check for an ``inversion'' in the normally ascending
			 * block numbers, indicating the start of the second
			 * request list.
			 */
			if (buf_inorder(nbq, bq, sortby)) {
				/*
				 * Search the second request list for the first
				 * request at a larger block number.  We go
				 * after that; if there is no such request, we
				 * go at the end.
				 */
				do {
					if (buf_inorder(bp, nbq, sortby))
						goto insert;
					bq = nbq;
				} while ((nbq =
				    TAILQ_NEXT(bq, b_actq)) != NULL);
				goto insert;	/* after last */
			}
			bq = nbq;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
		/*
		 * We want to go after the current request if there is an
		 * inversion after it (i.e. it is the end of the first
		 * request list), or if the next request is a larger cylinder
		 * than our request.
		 */
		if (buf_inorder(nbq, bq, sortby) ||
		    buf_inorder(bp, nbq, sortby))
			goto insert;
		bq = nbq;
	}
	/*
	 * Neither a second list nor a larger request... we go at the end of
	 * the first list, which is the same as the end of the whole schebang.
	 */
insert:	TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
}

/* Return (and, if remove is set, dequeue) the head of the scan order. */
static struct buf *
bufq_disksort_get(struct bufq_state *bufq, int remove)
{
	struct bufq_disksort *disksort = bufq->bq_private;
	struct buf *bp;

	bp = TAILQ_FIRST(&disksort->bq_head);

	if (bp != NULL && remove)
		TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);

	return (bp);
}


/*
 * Seek sort for disks.
 *
 * There are two queues.  The first queue holds read requests; the second
 * holds write requests.  The read queue is first-come first-served; the
 * write queue is sorted in ascending block order.
 * The read queue is processed first.  After PRIO_READ_BURST consecutive
 * read requests with non-empty write queue PRIO_WRITE_REQ requests from
 * the write queue will be processed.
 */
static void
bufq_prio_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_prio *prio = bufq->bq_private;
	struct buf *bq;
	int sortby;

	sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	/*
	 * If it's a read request append it to the list.
	 */
	if ((bp->b_flags & B_READ) == B_READ) {
		TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq);
		return;
	}

	bq = TAILQ_FIRST(&prio->bq_write);

	/*
	 * If the write list is empty, simply append it to the list.
	 */
	if (bq == NULL) {
		TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
		prio->bq_write_next = bp;
		return;
	}

	/*
	 * If we lie after the next request, insert after this request.
	 * (Start the search from the scan pointer instead of the head.)
	 */
	if (buf_inorder(prio->bq_write_next, bp, sortby))
		bq = prio->bq_write_next;

	/*
	 * Search for the first request at a larger block number.
	 * We go before this request if it exists.
	 */
	while (bq != NULL && buf_inorder(bq, bp, sortby))
		bq = TAILQ_NEXT(bq, b_actq);

	if (bq != NULL)
		TAILQ_INSERT_BEFORE(bq, bp, b_actq);
	else
		TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq);
}

/* Pick the next request, alternating read bursts with write bursts. */
static struct buf *
bufq_prio_get(struct bufq_state *bufq, int remove)
{
	struct bufq_prio *prio = bufq->bq_private;
	struct buf *bp;

	/*
	 * If no current request, get next from the lists.
	 */
	if (prio->bq_next == NULL) {
		/*
		 * If at least one list is empty, select the other.
		 */
		if (TAILQ_FIRST(&prio->bq_read) == NULL) {
			prio->bq_next = prio->bq_write_next;
			prio->bq_read_burst = 0;
		} else if (prio->bq_write_next == NULL) {
			prio->bq_next = TAILQ_FIRST(&prio->bq_read);
			prio->bq_read_burst = 0;
		} else {
			/*
			 * Both lists have requests.  Select the read list up
			 * to PRIO_READ_BURST times, then select the write
			 * list PRIO_WRITE_REQ times.
			 */
			if (prio->bq_read_burst++ < PRIO_READ_BURST)
				prio->bq_next = TAILQ_FIRST(&prio->bq_read);
			else if (prio->bq_read_burst <
			    PRIO_READ_BURST + PRIO_WRITE_REQ)
				prio->bq_next = prio->bq_write_next;
			else {
				prio->bq_next = TAILQ_FIRST(&prio->bq_read);
				prio->bq_read_burst = 0;
			}
		}
	}

	bp = prio->bq_next;

	if (bp != NULL && remove) {
		if ((bp->b_flags & B_READ) == B_READ)
			TAILQ_REMOVE(&prio->bq_read, bp, b_actq);
		else {
			/*
			 * Advance the write pointer before removing
			 * bp since it is actually prio->bq_write_next.
			 */
			prio->bq_write_next =
			    TAILQ_NEXT(prio->bq_write_next, b_actq);
			TAILQ_REMOVE(&prio->bq_write, bp, b_actq);
			/* Wrap the scan back to the lowest block number. */
			if (prio->bq_write_next == NULL)
				prio->bq_write_next =
				    TAILQ_FIRST(&prio->bq_write);
		}

		prio->bq_next = NULL;
	}

	return (bp);
}


/*
 * Cyclical scan (CSCAN)
 */
TAILQ_HEAD(bqhead, buf);
struct cscan_queue {
	struct bqhead cq_head[2];	/* actual lists of buffers */
	int cq_idx;			/* current list index */
	int cq_lastcylinder;		/* b_cylinder of the last request */
	daddr_t cq_lastrawblkno;	/* b_rawblkno of the last request */
};

static int __inline cscan_empty(const struct cscan_queue *);
static void cscan_put(struct cscan_queue *, struct buf *, int);
static struct buf *cscan_get(struct cscan_queue *, int);
static void cscan_init(struct cscan_queue *);

/* True iff both scan lists are empty. */
static __inline int
cscan_empty(const struct cscan_queue *q)
{

	return TAILQ_EMPTY(&q->cq_head[0]) && TAILQ_EMPTY(&q->cq_head[1]);
}

/*
 * Insert bp in sorted position: on the current list if it lies at or
 * beyond the last served position, otherwise on the other list, to be
 * served on the next sweep.
 */
static void
cscan_put(struct cscan_queue *q, struct buf *bp, int sortby)
{
	struct buf tmp;
	struct buf *it;
	struct bqhead *bqh;
	int idx;

	/* Fake a buffer at the last served position for comparison. */
	tmp.b_cylinder = q->cq_lastcylinder;
	tmp.b_rawblkno = q->cq_lastrawblkno;

	if (buf_inorder(bp, &tmp, sortby))
		idx = 1 - q->cq_idx;
	else
		idx = q->cq_idx;

	bqh = &q->cq_head[idx];

	TAILQ_FOREACH(it, bqh, b_actq)
		if (buf_inorder(bp, it, sortby))
			break;

	if (it != NULL)
		TAILQ_INSERT_BEFORE(it, bp, b_actq);
	else
		TAILQ_INSERT_TAIL(bqh, bp, b_actq);
}

/*
 * Return the next request in scan order, switching sweeps when the
 * current list is exhausted.  If remove is set, dequeue it and record
 * the position just past it as the new scan position.
 */
static struct buf *
cscan_get(struct cscan_queue *q, int remove)
{
	int idx = q->cq_idx;
	struct bqhead *bqh;
	struct buf *bp;

	bqh = &q->cq_head[idx];
	bp = TAILQ_FIRST(bqh);

	if (bp == NULL) {
		/* switch queue */
		idx = 1 - idx;
		bqh = &q->cq_head[idx];
		bp = TAILQ_FIRST(bqh);
	}

	KDASSERT((bp != NULL && !cscan_empty(q)) ||
	    (bp == NULL && cscan_empty(q)));

	if (bp != NULL && remove) {
		q->cq_idx = idx;
		TAILQ_REMOVE(bqh, bp, b_actq);

		q->cq_lastcylinder = bp->b_cylinder;
		q->cq_lastrawblkno =
		    bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
	}

	return (bp);
}

/* Initialize both scan lists of a CSCAN queue. */
static void
cscan_init(struct cscan_queue *q)
{

	TAILQ_INIT(&q->cq_head[0]);
	TAILQ_INIT(&q->cq_head[1]);
}


/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise
 * priority of the on-queue requests.
 */
#define PRIOCSCAN_NQUEUE	3

struct priocscan_queue {
	struct cscan_queue q_queue;
	int q_burst;		/* requests remaining in the current burst */
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if 0
	/*
	 * XXX using "global" head position can reduce positioning time
	 * when switching between queues.
	 * although it might affect against fairness.
	 */
	daddr_t bq_lastrawblkno;
	int bq_lastcylinder;
#endif
};

/*
 * how many requests to serve when having pending requests on other queues.
 *
 * XXX tune
 */
const int priocscan_burst[] = {
	64, 16, 4
};

static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);
static void bufq_priocscan_init(struct bufq_state *);
static __inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

/* Map a buffer's I/O priority to its per-priority CSCAN queue. */
static __inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

/* Enqueue bp on the CSCAN queue that matches its priority. */
static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct cscan_queue *cq;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp, sortby);
}

/*
 * Serve the highest-priority non-empty queue while it still has burst
 * budget; when all budgets run out, refill them and start again from
 * the highest-priority non-empty queue.
 */
static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* first non-empty queue */
	const struct priocscan_queue *epq;
	const struct cscan_queue *cq;
	struct buf *bp;
	boolean_t single; /* true if there's only one non-empty queue */

	/* Find the first (highest-priority) non-empty queue. */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		cq = &pq->q_queue;
		if (!cscan_empty(cq))
			break;
	}
	if (pq == epq) {
		/* there's no requests */
		return NULL;
	}

	/*
	 * Scan the remaining queues: note whether more than one is
	 * non-empty, and fall back to a lower-priority queue only when
	 * the current candidate has exhausted its burst budget.
	 */
	first = pq;
	single = TRUE;
	for (npq = first + 1; npq < epq; npq++) {
		cq = &npq->q_queue;
		if (!cscan_empty(cq)) {
			single = FALSE;
			if (pq->q_burst > 0)
				break;
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.  just serve it.
		 */
		pq = first;
	} else if (pq->q_burst > 0) {
		/*
		 * XXX account only by number of requests.  is it good enough?
		 */
		pq->q_burst--;
	} else {
		/*
		 * no queue was selected due to burst counts
		 */
		int i;
#ifdef DEBUG
		for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
			pq = &q->bq_queue[i];
			cq = &pq->q_queue;
			if (!cscan_empty(cq) && pq->q_burst)
				panic("%s: inconsist", __func__);
		}
#endif /* DEBUG */

		/*
		 * reset burst counts
		 */
		for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
			pq = &q->bq_queue[i];
			pq->q_burst = priocscan_burst[i];
		}

		/*
		 * serve first non-empty queue.
		 */
		pq = first;
	}

	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove);
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}

/* Set up the priocscan method table and its per-priority queues. */
static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_private = malloc(sizeof(struct bufq_priocscan),
	    M_DEVBUF, M_ZERO);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq);
	}
}


/*
 * Create a device buffer queue.
 */
void
bufq_alloc(struct bufq_state *bufq, int flags)
{
	struct bufq_fcfs *fcfs;
	struct bufq_disksort *disksort;
	struct bufq_prio *prio;

	bufq->bq_flags = flags;

	/*
	 * Validate the requested sort key; only FCFS may be created
	 * without one.
	 */
	switch (flags & BUFQ_SORT_MASK) {
	case BUFQ_SORT_RAWBLOCK:
	case BUFQ_SORT_CYLINDER:
		break;
	case 0:
		if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS)
			break;
		/* FALLTHROUGH */
	default:
		panic("bufq_alloc: sort out of range");
	}

	/*
	 * Hook up the strategy's get/put methods and allocate its
	 * zeroed private state.
	 */
	switch (flags & BUFQ_METHOD_MASK) {
	case BUFQ_FCFS:
		bufq->bq_get = bufq_fcfs_get;
		bufq->bq_put = bufq_fcfs_put;
		MALLOC(bufq->bq_private, struct bufq_fcfs *,
		    sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO);
		fcfs = (struct bufq_fcfs *)bufq->bq_private;
		TAILQ_INIT(&fcfs->bq_head);
		break;
	case BUFQ_DISKSORT:
		bufq->bq_get = bufq_disksort_get;
		bufq->bq_put = bufq_disksort_put;
		MALLOC(bufq->bq_private, struct bufq_disksort *,
		    sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO);
		disksort = (struct bufq_disksort *)bufq->bq_private;
		TAILQ_INIT(&disksort->bq_head);
		break;
	case BUFQ_READ_PRIO:
		bufq->bq_get = bufq_prio_get;
		bufq->bq_put = bufq_prio_put;
		MALLOC(bufq->bq_private, struct bufq_prio *,
		    sizeof(struct bufq_prio), M_DEVBUF, M_ZERO);
		prio = (struct bufq_prio *)bufq->bq_private;
		TAILQ_INIT(&prio->bq_read);
		TAILQ_INIT(&prio->bq_write);
		break;
	case BUFQ_PRIOCSCAN:
		bufq_priocscan_init(bufq);
		break;
	default:
		panic("bufq_alloc: method out of range");
	}
}

/*
 * Destroy a device buffer queue.  The queue must already be drained;
 * only the private state is released, the bufq_state itself belongs
 * to the caller.
 */
void
bufq_free(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	KASSERT(BUFQ_PEEK(bufq) == NULL);

	FREE(bufq->bq_private, M_DEVBUF);
	bufq->bq_get = NULL;
	bufq->bq_put = NULL;
}

/*
 * Bounds checking against the media size, used for the raw partition.
1092 * The sector size passed in should currently always be DEV_BSIZE, 1093 * and the media size the size of the device in DEV_BSIZE sectors. 1094 */ 1095 int 1096 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize) 1097 { 1098 int sz; 1099 1100 sz = howmany(bp->b_bcount, secsize); 1101 1102 if (bp->b_blkno + sz > mediasize) { 1103 sz = mediasize - bp->b_blkno; 1104 if (sz == 0) { 1105 /* If exactly at end of disk, return EOF. */ 1106 bp->b_resid = bp->b_bcount; 1107 goto done; 1108 } 1109 if (sz < 0) { 1110 /* If past end of disk, return EINVAL. */ 1111 bp->b_error = EINVAL; 1112 goto bad; 1113 } 1114 /* Otherwise, truncate request. */ 1115 bp->b_bcount = sz << DEV_BSHIFT; 1116 } 1117 1118 return 1; 1119 1120 bad: 1121 bp->b_flags |= B_ERROR; 1122 done: 1123 return 0; 1124 } 1125