1 /* $NetBSD: subr_disk.c,v 1.60 2004/03/09 12:23:07 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1982, 1986, 1988, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. Neither the name of the University nor the names of its contributors 58 * may be used to endorse or promote products derived from this software 59 * without specific prior written permission. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 * 73 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 74 */ 75 76 #include <sys/cdefs.h> 77 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.60 2004/03/09 12:23:07 yamt Exp $"); 78 79 #include "opt_compat_netbsd.h" 80 #include "opt_bufq.h" 81 82 #include <sys/param.h> 83 #include <sys/kernel.h> 84 #include <sys/malloc.h> 85 #include <sys/buf.h> 86 #include <sys/syslog.h> 87 #include <sys/disklabel.h> 88 #include <sys/disk.h> 89 #include <sys/sysctl.h> 90 #include <lib/libkern/libkern.h> 91 92 /* 93 * A global list of all disks attached to the system. May grow or 94 * shrink over time. 95 */ 96 struct disklist_head disklist; /* TAILQ_HEAD */ 97 int disk_count; /* number of drives in global disklist */ 98 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 99 100 #ifdef NEW_BUFQ_STRATEGY 101 int bufq_disk_default_strat = BUFQ_READ_PRIO; 102 #else /* NEW_BUFQ_STRATEGY */ 103 int bufq_disk_default_strat = BUFQ_DISKSORT; 104 #endif /* NEW_BUFQ_STRATEGY */ 105 106 /* 107 * Compute checksum for disk label. 108 */ 109 u_int 110 dkcksum(struct disklabel *lp) 111 { 112 u_short *start, *end; 113 u_short sum = 0; 114 115 start = (u_short *)lp; 116 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 117 while (start < end) 118 sum ^= *start++; 119 return (sum); 120 } 121 122 /* 123 * Disk error is the preface to plaintive error messages 124 * about failing disk transfers. It prints messages of the form 125 126 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 127 128 * if the offset of the error in the transfer and a disk label 129 * are both available. blkdone should be -1 if the position of the error 130 * is unknown; the disklabel pointer may be null from drivers that have not 131 * been converted to use them. The message is printed with printf 132 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 133 * The message should be completed (with at least a newline) with printf 134 * or addlog, respectively. There is no trailing space. 135 */ 136 #ifndef PRIdaddr 137 #define PRIdaddr PRId64 138 #endif 139 void 140 diskerr(const struct buf *bp, const char *dname, const char *what, int pri, 141 int blkdone, const struct disklabel *lp) 142 { 143 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 144 void (*pr)(const char *, ...); 145 char partname = 'a' + part; 146 daddr_t sn; 147 148 if (/*CONSTCOND*/0) 149 /* Compiler will error this is the format is wrong... */ 150 printf("%" PRIdaddr, bp->b_blkno); 151 152 if (pri != LOG_PRINTF) { 153 static const char fmt[] = ""; 154 log(pri, fmt); 155 pr = addlog; 156 } else 157 pr = printf; 158 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 159 bp->b_flags & B_READ ? "read" : "writ"); 160 sn = bp->b_blkno; 161 if (bp->b_bcount <= DEV_BSIZE) 162 (*pr)("%" PRIdaddr, sn); 163 else { 164 if (blkdone >= 0) { 165 sn += blkdone; 166 (*pr)("%" PRIdaddr " of ", sn); 167 } 168 (*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno, 169 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 170 } 171 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 172 sn += lp->d_partitions[part].p_offset; 173 (*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "", 174 dname, unit, sn, sn / lp->d_secpercyl); 175 sn %= lp->d_secpercyl; 176 (*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")", 177 sn / lp->d_nsectors, sn % lp->d_nsectors); 178 } 179 } 180 181 /* 182 * Initialize the disklist. Called by main() before autoconfiguration. 183 */ 184 void 185 disk_init(void) 186 { 187 188 TAILQ_INIT(&disklist); 189 disk_count = 0; 190 } 191 192 /* 193 * Searches the disklist for the disk corresponding to the 194 * name provided. 195 */ 196 struct disk * 197 disk_find(char *name) 198 { 199 struct disk *diskp; 200 201 if ((name == NULL) || (disk_count <= 0)) 202 return (NULL); 203 204 simple_lock(&disklist_slock); 205 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 206 diskp = TAILQ_NEXT(diskp, dk_link)) 207 if (strcmp(diskp->dk_name, name) == 0) { 208 simple_unlock(&disklist_slock); 209 return (diskp); 210 } 211 simple_unlock(&disklist_slock); 212 213 return (NULL); 214 } 215 216 /* 217 * Attach a disk. 218 */ 219 void 220 disk_attach(struct disk *diskp) 221 { 222 int s; 223 224 /* 225 * Allocate and initialize the disklabel structures. Note that 226 * it's not safe to sleep here, since we're probably going to be 227 * called during autoconfiguration. 228 */ 229 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 230 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 231 M_NOWAIT); 232 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 233 panic("disk_attach: can't allocate storage for disklabel"); 234 235 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 236 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 237 238 /* 239 * Set the attached timestamp. 240 */ 241 s = splclock(); 242 diskp->dk_attachtime = mono_time; 243 splx(s); 244 245 /* 246 * Link into the disklist. 247 */ 248 simple_lock(&disklist_slock); 249 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 250 simple_unlock(&disklist_slock); 251 ++disk_count; 252 } 253 254 /* 255 * Detach a disk. 256 */ 257 void 258 disk_detach(struct disk *diskp) 259 { 260 261 /* 262 * Remove from the disklist. 263 */ 264 if (--disk_count < 0) 265 panic("disk_detach: disk_count < 0"); 266 simple_lock(&disklist_slock); 267 TAILQ_REMOVE(&disklist, diskp, dk_link); 268 simple_unlock(&disklist_slock); 269 270 /* 271 * Free the space used by the disklabel structures. 272 */ 273 free(diskp->dk_label, M_DEVBUF); 274 free(diskp->dk_cpulabel, M_DEVBUF); 275 } 276 277 /* 278 * Increment a disk's busy counter. If the counter is going from 279 * 0 to 1, set the timestamp. 280 */ 281 void 282 disk_busy(struct disk *diskp) 283 { 284 int s; 285 286 /* 287 * XXX We'd like to use something as accurate as microtime(), 288 * but that doesn't depend on the system TOD clock. 289 */ 290 if (diskp->dk_busy++ == 0) { 291 s = splclock(); 292 diskp->dk_timestamp = mono_time; 293 splx(s); 294 } 295 } 296 297 /* 298 * Decrement a disk's busy counter, increment the byte count, total busy 299 * time, and reset the timestamp. 300 */ 301 void 302 disk_unbusy(struct disk *diskp, long bcount, int read) 303 { 304 int s; 305 struct timeval dv_time, diff_time; 306 307 if (diskp->dk_busy-- == 0) { 308 printf("%s: dk_busy < 0\n", diskp->dk_name); 309 panic("disk_unbusy"); 310 } 311 312 s = splclock(); 313 dv_time = mono_time; 314 splx(s); 315 316 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 317 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 318 319 diskp->dk_timestamp = dv_time; 320 if (bcount > 0) { 321 if (read) { 322 diskp->dk_rbytes += bcount; 323 diskp->dk_rxfer++; 324 } else { 325 diskp->dk_wbytes += bcount; 326 diskp->dk_wxfer++; 327 } 328 } 329 } 330 331 /* 332 * Reset the metrics counters on the given disk. Note that we cannot 333 * reset the busy counter, as it may case a panic in disk_unbusy(). 334 * We also must avoid playing with the timestamp information, as it 335 * may skew any pending transfer results. 336 */ 337 void 338 disk_resetstat(struct disk *diskp) 339 { 340 int s = splbio(), t; 341 342 diskp->dk_rxfer = 0; 343 diskp->dk_rbytes = 0; 344 diskp->dk_wxfer = 0; 345 diskp->dk_wbytes = 0; 346 347 t = splclock(); 348 diskp->dk_attachtime = mono_time; 349 splx(t); 350 351 timerclear(&diskp->dk_time); 352 353 splx(s); 354 } 355 356 int 357 sysctl_hw_disknames(SYSCTLFN_ARGS) 358 { 359 char buf[DK_DISKNAMELEN + 1]; 360 char *where = oldp; 361 struct disk *diskp; 362 size_t needed, left, slen; 363 int error, first; 364 365 if (newp != NULL) 366 return (EPERM); 367 if (namelen != 0) 368 return (EINVAL); 369 370 first = 1; 371 error = 0; 372 needed = 0; 373 left = *oldlenp; 374 375 simple_lock(&disklist_slock); 376 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 377 diskp = TAILQ_NEXT(diskp, dk_link)) { 378 if (where == NULL) 379 needed += strlen(diskp->dk_name) + 1; 380 else { 381 memset(buf, 0, sizeof(buf)); 382 if (first) { 383 strncpy(buf, diskp->dk_name, sizeof(buf)); 384 first = 0; 385 } else { 386 buf[0] = ' '; 387 strncpy(buf + 1, diskp->dk_name, 388 sizeof(buf) - 1); 389 } 390 buf[DK_DISKNAMELEN] = '\0'; 391 slen = strlen(buf); 392 if (left < slen + 1) 393 break; 394 /* +1 to copy out the trailing NUL byte */ 395 error = copyout(buf, where, slen + 1); 396 if (error) 397 break; 398 where += slen; 399 needed += slen; 400 left -= slen; 401 } 402 } 403 simple_unlock(&disklist_slock); 404 *oldlenp = needed; 405 return (error); 406 } 407 408 int 409 sysctl_hw_diskstats(SYSCTLFN_ARGS) 410 { 411 struct disk_sysctl sdisk; 412 struct disk *diskp; 413 char *where = oldp; 414 size_t tocopy, left; 415 int error; 416 417 if (newp != NULL) 418 return (EPERM); 419 420 /* 421 * The original hw.diskstats call was broken and did not require 422 * the userland to pass in it's size of struct disk_sysctl. This 423 * was fixed after NetBSD 1.6 was released, and any applications 424 * that do not pass in the size are given an error only, unless 425 * we care about 1.6 compatibility. 426 */ 427 if (namelen == 0) 428 #ifdef COMPAT_16 429 tocopy = offsetof(struct disk_sysctl, dk_rxfer); 430 #else 431 return (EINVAL); 432 #endif 433 else 434 tocopy = name[0]; 435 436 if (where == NULL) { 437 *oldlenp = disk_count * tocopy; 438 return (0); 439 } 440 441 error = 0; 442 left = *oldlenp; 443 memset(&sdisk, 0, sizeof(sdisk)); 444 *oldlenp = 0; 445 446 simple_lock(&disklist_slock); 447 TAILQ_FOREACH(diskp, &disklist, dk_link) { 448 if (left < tocopy) 449 break; 450 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name)); 451 sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer; 452 sdisk.dk_rxfer = diskp->dk_rxfer; 453 sdisk.dk_wxfer = diskp->dk_wxfer; 454 sdisk.dk_seek = diskp->dk_seek; 455 sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes; 456 sdisk.dk_rbytes = diskp->dk_rbytes; 457 sdisk.dk_wbytes = diskp->dk_wbytes; 458 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec; 459 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec; 460 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec; 461 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec; 462 sdisk.dk_time_sec = diskp->dk_time.tv_sec; 463 sdisk.dk_time_usec = diskp->dk_time.tv_usec; 464 sdisk.dk_busy = diskp->dk_busy; 465 466 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk))); 467 if (error) 468 break; 469 where += tocopy; 470 *oldlenp += tocopy; 471 left -= tocopy; 472 } 473 simple_unlock(&disklist_slock); 474 return (error); 475 } 476 477 struct bufq_fcfs { 478 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 479 }; 480 481 struct bufq_disksort { 482 TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */ 483 }; 484 485 #define PRIO_READ_BURST 48 486 #define PRIO_WRITE_REQ 16 487 488 struct bufq_prio { 489 TAILQ_HEAD(, buf) bq_read, bq_write; /* actual list of buffers */ 490 struct buf *bq_write_next; /* next request in bq_write */ 491 struct buf *bq_next; /* current request */ 492 int bq_read_burst; /* # of consecutive reads */ 493 }; 494 495 496 static __inline int buf_inorder(const struct buf *, const struct buf *, int); 497 498 /* 499 * Check if two buf's are in ascending order. 500 */ 501 static __inline int 502 buf_inorder(const struct buf *bp, const struct buf *bq, int sortby) 503 { 504 505 if (bp == NULL || bq == NULL) 506 return (bq == NULL); 507 508 if (sortby == BUFQ_SORT_CYLINDER) { 509 if (bp->b_cylinder != bq->b_cylinder) 510 return bp->b_cylinder < bq->b_cylinder; 511 else 512 return bp->b_rawblkno < bq->b_rawblkno; 513 } else 514 return bp->b_rawblkno < bq->b_rawblkno; 515 } 516 517 518 /* 519 * First-come first-served sort for disks. 520 * 521 * Requests are appended to the queue without any reordering. 522 */ 523 static void 524 bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp) 525 { 526 struct bufq_fcfs *fcfs = bufq->bq_private; 527 528 TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq); 529 } 530 531 static struct buf * 532 bufq_fcfs_get(struct bufq_state *bufq, int remove) 533 { 534 struct bufq_fcfs *fcfs = bufq->bq_private; 535 struct buf *bp; 536 537 bp = TAILQ_FIRST(&fcfs->bq_head); 538 539 if (bp != NULL && remove) 540 TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq); 541 542 return (bp); 543 } 544 545 546 /* 547 * Seek sort for disks. 548 * 549 * There are actually two queues, sorted in ascendening order. The first 550 * queue holds those requests which are positioned after the current block; 551 * the second holds requests which came in after their position was passed. 552 * Thus we implement a one-way scan, retracting after reaching the end of 553 * the drive to the first request on the second queue, at which time it 554 * becomes the first queue. 555 * 556 * A one-way scan is natural because of the way UNIX read-ahead blocks are 557 * allocated. 558 */ 559 static void 560 bufq_disksort_put(struct bufq_state *bufq, struct buf *bp) 561 { 562 struct bufq_disksort *disksort = bufq->bq_private; 563 struct buf *bq, *nbq; 564 int sortby; 565 566 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 567 568 bq = TAILQ_FIRST(&disksort->bq_head); 569 570 /* 571 * If the queue is empty it's easy; we just go on the end. 572 */ 573 if (bq == NULL) { 574 TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq); 575 return; 576 } 577 578 /* 579 * If we lie before the currently active request, then we 580 * must locate the second request list and add ourselves to it. 581 */ 582 if (buf_inorder(bp, bq, sortby)) { 583 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 584 /* 585 * Check for an ``inversion'' in the normally ascending 586 * block numbers, indicating the start of the second 587 * request list. 588 */ 589 if (buf_inorder(nbq, bq, sortby)) { 590 /* 591 * Search the second request list for the first 592 * request at a larger block number. We go 593 * after that; if there is no such request, we 594 * go at the end. 595 */ 596 do { 597 if (buf_inorder(bp, nbq, sortby)) 598 goto insert; 599 bq = nbq; 600 } while ((nbq = 601 TAILQ_NEXT(bq, b_actq)) != NULL); 602 goto insert; /* after last */ 603 } 604 bq = nbq; 605 } 606 /* 607 * No inversions... we will go after the last, and 608 * be the first request in the second request list. 609 */ 610 goto insert; 611 } 612 /* 613 * Request is at/after the current request... 614 * sort in the first request list. 615 */ 616 while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) { 617 /* 618 * We want to go after the current request if there is an 619 * inversion after it (i.e. it is the end of the first 620 * request list), or if the next request is a larger cylinder 621 * than our request. 622 */ 623 if (buf_inorder(nbq, bq, sortby) || 624 buf_inorder(bp, nbq, sortby)) 625 goto insert; 626 bq = nbq; 627 } 628 /* 629 * Neither a second list nor a larger request... we go at the end of 630 * the first list, which is the same as the end of the whole schebang. 631 */ 632 insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq); 633 } 634 635 static struct buf * 636 bufq_disksort_get(struct bufq_state *bufq, int remove) 637 { 638 struct bufq_disksort *disksort = bufq->bq_private; 639 struct buf *bp; 640 641 bp = TAILQ_FIRST(&disksort->bq_head); 642 643 if (bp != NULL && remove) 644 TAILQ_REMOVE(&disksort->bq_head, bp, b_actq); 645 646 return (bp); 647 } 648 649 650 /* 651 * Seek sort for disks. 652 * 653 * There are two queues. The first queue holds read requests; the second 654 * holds write requests. The read queue is first-come first-served; the 655 * write queue is sorted in ascendening block order. 656 * The read queue is processed first. After PRIO_READ_BURST consecutive 657 * read requests with non-empty write queue PRIO_WRITE_REQ requests from 658 * the write queue will be processed. 659 */ 660 static void 661 bufq_prio_put(struct bufq_state *bufq, struct buf *bp) 662 { 663 struct bufq_prio *prio = bufq->bq_private; 664 struct buf *bq; 665 int sortby; 666 667 sortby = bufq->bq_flags & BUFQ_SORT_MASK; 668 669 /* 670 * If it's a read request append it to the list. 671 */ 672 if ((bp->b_flags & B_READ) == B_READ) { 673 TAILQ_INSERT_TAIL(&prio->bq_read, bp, b_actq); 674 return; 675 } 676 677 bq = TAILQ_FIRST(&prio->bq_write); 678 679 /* 680 * If the write list is empty, simply append it to the list. 681 */ 682 if (bq == NULL) { 683 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 684 prio->bq_write_next = bp; 685 return; 686 } 687 688 /* 689 * If we lie after the next request, insert after this request. 690 */ 691 if (buf_inorder(prio->bq_write_next, bp, sortby)) 692 bq = prio->bq_write_next; 693 694 /* 695 * Search for the first request at a larger block number. 696 * We go before this request if it exists. 697 */ 698 while (bq != NULL && buf_inorder(bq, bp, sortby)) 699 bq = TAILQ_NEXT(bq, b_actq); 700 701 if (bq != NULL) 702 TAILQ_INSERT_BEFORE(bq, bp, b_actq); 703 else 704 TAILQ_INSERT_TAIL(&prio->bq_write, bp, b_actq); 705 } 706 707 static struct buf * 708 bufq_prio_get(struct bufq_state *bufq, int remove) 709 { 710 struct bufq_prio *prio = bufq->bq_private; 711 struct buf *bp; 712 713 /* 714 * If no current request, get next from the lists. 715 */ 716 if (prio->bq_next == NULL) { 717 /* 718 * If at least one list is empty, select the other. 719 */ 720 if (TAILQ_FIRST(&prio->bq_read) == NULL) { 721 prio->bq_next = prio->bq_write_next; 722 prio->bq_read_burst = 0; 723 } else if (prio->bq_write_next == NULL) { 724 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 725 prio->bq_read_burst = 0; 726 } else { 727 /* 728 * Both list have requests. Select the read list up 729 * to PRIO_READ_BURST times, then select the write 730 * list PRIO_WRITE_REQ times. 731 */ 732 if (prio->bq_read_burst++ < PRIO_READ_BURST) 733 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 734 else if (prio->bq_read_burst < 735 PRIO_READ_BURST + PRIO_WRITE_REQ) 736 prio->bq_next = prio->bq_write_next; 737 else { 738 prio->bq_next = TAILQ_FIRST(&prio->bq_read); 739 prio->bq_read_burst = 0; 740 } 741 } 742 } 743 744 bp = prio->bq_next; 745 746 if (bp != NULL && remove) { 747 if ((bp->b_flags & B_READ) == B_READ) 748 TAILQ_REMOVE(&prio->bq_read, bp, b_actq); 749 else { 750 /* 751 * Advance the write pointer before removing 752 * bp since it is actually prio->bq_write_next. 753 */ 754 prio->bq_write_next = 755 TAILQ_NEXT(prio->bq_write_next, b_actq); 756 TAILQ_REMOVE(&prio->bq_write, bp, b_actq); 757 if (prio->bq_write_next == NULL) 758 prio->bq_write_next = 759 TAILQ_FIRST(&prio->bq_write); 760 } 761 762 prio->bq_next = NULL; 763 } 764 765 return (bp); 766 } 767 768 769 /* 770 * Cyclical scan (CSCAN) 771 */ 772 TAILQ_HEAD(bqhead, buf); 773 struct cscan_queue { 774 struct bqhead cq_head[2]; /* actual lists of buffers */ 775 int cq_idx; /* current list index */ 776 int cq_lastcylinder; /* b_cylinder of the last request */ 777 daddr_t cq_lastrawblkno; /* b_rawblkno of the last request */ 778 }; 779 780 static int __inline cscan_empty(const struct cscan_queue *); 781 static void cscan_put(struct cscan_queue *, struct buf *, int); 782 static struct buf *cscan_get(struct cscan_queue *, int); 783 static void cscan_init(struct cscan_queue *); 784 785 static __inline int 786 cscan_empty(const struct cscan_queue *q) 787 { 788 789 return TAILQ_EMPTY(&q->cq_head[0]) && TAILQ_EMPTY(&q->cq_head[1]); 790 } 791 792 static void 793 cscan_put(struct cscan_queue *q, struct buf *bp, int sortby) 794 { 795 struct buf tmp; 796 struct buf *it; 797 struct bqhead *bqh; 798 int idx; 799 800 tmp.b_cylinder = q->cq_lastcylinder; 801 tmp.b_rawblkno = q->cq_lastrawblkno; 802 803 if (buf_inorder(bp, &tmp, sortby)) 804 idx = 1 - q->cq_idx; 805 else 806 idx = q->cq_idx; 807 808 bqh = &q->cq_head[idx]; 809 810 TAILQ_FOREACH(it, bqh, b_actq) 811 if (buf_inorder(bp, it, sortby)) 812 break; 813 814 if (it != NULL) 815 TAILQ_INSERT_BEFORE(it, bp, b_actq); 816 else 817 TAILQ_INSERT_TAIL(bqh, bp, b_actq); 818 } 819 820 static struct buf * 821 cscan_get(struct cscan_queue *q, int remove) 822 { 823 int idx = q->cq_idx; 824 struct bqhead *bqh; 825 struct buf *bp; 826 827 bqh = &q->cq_head[idx]; 828 bp = TAILQ_FIRST(bqh); 829 830 if (bp == NULL) { 831 /* switch queue */ 832 idx = 1 - idx; 833 bqh = &q->cq_head[idx]; 834 bp = TAILQ_FIRST(bqh); 835 } 836 837 KDASSERT((bp != NULL && !cscan_empty(q)) || 838 (bp == NULL && cscan_empty(q))); 839 840 if (bp != NULL && remove) { 841 q->cq_idx = idx; 842 TAILQ_REMOVE(bqh, bp, b_actq); 843 844 q->cq_lastcylinder = bp->b_cylinder; 845 q->cq_lastrawblkno = 846 bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT); 847 } 848 849 return (bp); 850 } 851 852 static void 853 cscan_init(struct cscan_queue *q) 854 { 855 856 TAILQ_INIT(&q->cq_head[0]); 857 TAILQ_INIT(&q->cq_head[1]); 858 } 859 860 861 /* 862 * Per-prioritiy CSCAN. 863 * 864 * XXX probably we should have a way to raise 865 * priority of the on-queue requests. 866 */ 867 #define PRIOCSCAN_NQUEUE 3 868 869 struct priocscan_queue { 870 struct cscan_queue q_queue; 871 int q_burst; 872 }; 873 874 struct bufq_priocscan { 875 struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE]; 876 877 #if 0 878 /* 879 * XXX using "global" head position can reduce positioning time 880 * when switching between queues. 881 * although it might affect against fairness. 882 */ 883 daddr_t bq_lastrawblkno; 884 int bq_lastcylinder; 885 #endif 886 }; 887 888 /* 889 * how many requests to serve when having pending requests on other queues. 890 * 891 * XXX tune 892 */ 893 const int priocscan_burst[] = { 894 64, 16, 4 895 }; 896 897 static void bufq_priocscan_put(struct bufq_state *, struct buf *); 898 static struct buf *bufq_priocscan_get(struct bufq_state *, int); 899 static void bufq_priocscan_init(struct bufq_state *); 900 static __inline struct cscan_queue *bufq_priocscan_selectqueue( 901 struct bufq_priocscan *, const struct buf *); 902 903 static __inline struct cscan_queue * 904 bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp) 905 { 906 static const int priocscan_priomap[] = { 907 [BPRIO_TIMENONCRITICAL] = 2, 908 [BPRIO_TIMELIMITED] = 1, 909 [BPRIO_TIMECRITICAL] = 0 910 }; 911 912 return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue; 913 } 914 915 static void 916 bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp) 917 { 918 struct bufq_priocscan *q = bufq->bq_private; 919 struct cscan_queue *cq; 920 const int sortby = bufq->bq_flags & BUFQ_SORT_MASK; 921 922 cq = bufq_priocscan_selectqueue(q, bp); 923 cscan_put(cq, bp, sortby); 924 } 925 926 static struct buf * 927 bufq_priocscan_get(struct bufq_state *bufq, int remove) 928 { 929 struct bufq_priocscan *q = bufq->bq_private; 930 struct priocscan_queue *pq, *npq; 931 struct priocscan_queue *first; /* first non-empty queue */ 932 const struct priocscan_queue *epq; 933 const struct cscan_queue *cq; 934 struct buf *bp; 935 boolean_t single; /* true if there's only one non-empty queue */ 936 937 pq = &q->bq_queue[0]; 938 epq = pq + PRIOCSCAN_NQUEUE; 939 for (; pq < epq; pq++) { 940 cq = &pq->q_queue; 941 if (!cscan_empty(cq)) 942 break; 943 } 944 if (pq == epq) { 945 /* there's no requests */ 946 return NULL; 947 } 948 949 first = pq; 950 single = TRUE; 951 for (npq = first + 1; npq < epq; npq++) { 952 cq = &npq->q_queue; 953 if (!cscan_empty(cq)) { 954 single = FALSE; 955 if (pq->q_burst > 0) 956 break; 957 pq = npq; 958 } 959 } 960 if (single) { 961 /* 962 * there's only a non-empty queue. just serve it. 963 */ 964 pq = first; 965 } else if (pq->q_burst > 0) { 966 /* 967 * XXX account only by number of requests. is it good enough? 968 */ 969 pq->q_burst--; 970 } else { 971 /* 972 * no queue was selected due to burst counts 973 */ 974 int i; 975 #ifdef DEBUG 976 for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { 977 pq = &q->bq_queue[i]; 978 cq = &pq->q_queue; 979 if (!cscan_empty(cq) && pq->q_burst) 980 panic("%s: inconsist", __func__); 981 } 982 #endif /* DEBUG */ 983 984 /* 985 * reset burst counts 986 */ 987 for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { 988 pq = &q->bq_queue[i]; 989 pq->q_burst = priocscan_burst[i]; 990 } 991 992 /* 993 * serve first non-empty queue. 994 */ 995 pq = first; 996 } 997 998 KDASSERT(!cscan_empty(&pq->q_queue)); 999 bp = cscan_get(&pq->q_queue, remove); 1000 KDASSERT(bp != NULL); 1001 KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp)); 1002 1003 return bp; 1004 } 1005 1006 static void 1007 bufq_priocscan_init(struct bufq_state *bufq) 1008 { 1009 struct bufq_priocscan *q; 1010 int i; 1011 1012 bufq->bq_get = bufq_priocscan_get; 1013 bufq->bq_put = bufq_priocscan_put; 1014 bufq->bq_private = malloc(sizeof(struct bufq_priocscan), 1015 M_DEVBUF, M_ZERO); 1016 1017 q = bufq->bq_private; 1018 for (i = 0; i < PRIOCSCAN_NQUEUE; i++) { 1019 struct cscan_queue *cq = &q->bq_queue[i].q_queue; 1020 1021 cscan_init(cq); 1022 } 1023 } 1024 1025 1026 /* 1027 * Create a device buffer queue. 1028 */ 1029 void 1030 bufq_alloc(struct bufq_state *bufq, int flags) 1031 { 1032 struct bufq_fcfs *fcfs; 1033 struct bufq_disksort *disksort; 1034 struct bufq_prio *prio; 1035 1036 bufq->bq_flags = flags; 1037 1038 switch (flags & BUFQ_SORT_MASK) { 1039 case BUFQ_SORT_RAWBLOCK: 1040 case BUFQ_SORT_CYLINDER: 1041 break; 1042 case 0: 1043 if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS) 1044 break; 1045 /* FALLTHROUGH */ 1046 default: 1047 panic("bufq_alloc: sort out of range"); 1048 } 1049 1050 switch (flags & BUFQ_METHOD_MASK) { 1051 case BUFQ_FCFS: 1052 bufq->bq_get = bufq_fcfs_get; 1053 bufq->bq_put = bufq_fcfs_put; 1054 MALLOC(bufq->bq_private, struct bufq_fcfs *, 1055 sizeof(struct bufq_fcfs), M_DEVBUF, M_ZERO); 1056 fcfs = (struct bufq_fcfs *)bufq->bq_private; 1057 TAILQ_INIT(&fcfs->bq_head); 1058 break; 1059 case BUFQ_DISKSORT: 1060 bufq->bq_get = bufq_disksort_get; 1061 bufq->bq_put = bufq_disksort_put; 1062 MALLOC(bufq->bq_private, struct bufq_disksort *, 1063 sizeof(struct bufq_disksort), M_DEVBUF, M_ZERO); 1064 disksort = (struct bufq_disksort *)bufq->bq_private; 1065 TAILQ_INIT(&disksort->bq_head); 1066 break; 1067 case BUFQ_READ_PRIO: 1068 bufq->bq_get = bufq_prio_get; 1069 bufq->bq_put = bufq_prio_put; 1070 MALLOC(bufq->bq_private, struct bufq_prio *, 1071 sizeof(struct bufq_prio), M_DEVBUF, M_ZERO); 1072 prio = (struct bufq_prio *)bufq->bq_private; 1073 TAILQ_INIT(&prio->bq_read); 1074 TAILQ_INIT(&prio->bq_write); 1075 break; 1076 case BUFQ_PRIOCSCAN: 1077 bufq_priocscan_init(bufq); 1078 break; 1079 default: 1080 panic("bufq_alloc: method out of range"); 1081 } 1082 } 1083 1084 /* 1085 * Destroy a device buffer queue. 1086 */ 1087 void 1088 bufq_free(struct bufq_state *bufq) 1089 { 1090 1091 KASSERT(bufq->bq_private != NULL); 1092 KASSERT(BUFQ_PEEK(bufq) == NULL); 1093 1094 FREE(bufq->bq_private, M_DEVBUF); 1095 bufq->bq_get = NULL; 1096 bufq->bq_put = NULL; 1097 } 1098 1099 /* 1100 * Bounds checking against the media size, used for the raw partition. 1101 * The sector size passed in should currently always be DEV_BSIZE, 1102 * and the media size the size of the device in DEV_BSIZE sectors. 1103 */ 1104 int 1105 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize) 1106 { 1107 int sz; 1108 1109 sz = howmany(bp->b_bcount, secsize); 1110 1111 if (bp->b_blkno + sz > mediasize) { 1112 sz = mediasize - bp->b_blkno; 1113 if (sz == 0) { 1114 /* If exactly at end of disk, return EOF. */ 1115 bp->b_resid = bp->b_bcount; 1116 goto done; 1117 } 1118 if (sz < 0) { 1119 /* If past end of disk, return EINVAL. */ 1120 bp->b_error = EINVAL; 1121 goto bad; 1122 } 1123 /* Otherwise, truncate request. */ 1124 bp->b_bcount = sz << DEV_BSHIFT; 1125 } 1126 1127 return 1; 1128 1129 bad: 1130 bp->b_flags |= B_ERROR; 1131 done: 1132 return 0; 1133 } 1134