1 /* $OpenBSD: subr_disk.c,v 1.26 2004/06/24 19:35:24 tholo Exp $ */ 2 /* $NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1995 Jason R. Thorpe. All rights reserved. 6 * Copyright (c) 1982, 1986, 1988, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/kernel.h> 44 #include <sys/malloc.h> 45 #include <sys/fcntl.h> 46 #include <sys/buf.h> 47 #include <sys/stat.h> 48 #include <sys/syslog.h> 49 #include <sys/time.h> 50 #include <sys/disklabel.h> 51 #include <sys/conf.h> 52 #include <sys/lock.h> 53 #include <sys/disk.h> 54 #include <sys/dkio.h> 55 #include <sys/dkstat.h> /* XXX */ 56 #include <sys/proc.h> 57 58 #include <dev/rndvar.h> 59 60 /* 61 * A global list of all disks attached to the system. May grow or 62 * shrink over time. 63 */ 64 struct disklist_head disklist; /* TAILQ_HEAD */ 65 int disk_count; /* number of drives in global disklist */ 66 int disk_change; /* set if a disk has been attached/detached 67 * since last we looked at this variable. This 68 * is reset by hw_sysctl() 69 */ 70 71 /* 72 * Seek sort for disks. We depend on the driver which calls us using b_resid 73 * as the current cylinder number. 74 * 75 * The argument ap structure holds a b_actf activity chain pointer on which we 76 * keep two queues, sorted in ascending cylinder order. The first queue holds 77 * those requests which are positioned after the current cylinder (in the first 78 * request); the second holds requests which came in after their cylinder number 79 * was passed. Thus we implement a one way scan, retracting after reaching the 80 * end of the drive to the first request on the second queue, at which time it 81 * becomes the first queue. 82 * 83 * A one-way scan is natural because of the way UNIX read-ahead blocks are 84 * allocated. 85 */ 86 87 void 88 disksort(ap, bp) 89 register struct buf *ap, *bp; 90 { 91 register struct buf *bq; 92 93 /* If the queue is empty, then it's easy. */ 94 if (ap->b_actf == NULL) { 95 bp->b_actf = NULL; 96 ap->b_actf = bp; 97 return; 98 } 99 100 /* 101 * If we lie after the first (currently active) request, then we 102 * must locate the second request list and add ourselves to it. 103 */ 104 bq = ap->b_actf; 105 if (bp->b_cylinder < bq->b_cylinder) { 106 while (bq->b_actf) { 107 /* 108 * Check for an ``inversion'' in the normally ascending 109 * cylinder numbers, indicating the start of the second 110 * request list. 111 */ 112 if (bq->b_actf->b_cylinder < bq->b_cylinder) { 113 /* 114 * Search the second request list for the first 115 * request at a larger cylinder number. We go 116 * before that; if there is no such request, we 117 * go at end. 118 */ 119 do { 120 if (bp->b_cylinder < 121 bq->b_actf->b_cylinder) 122 goto insert; 123 if (bp->b_cylinder == 124 bq->b_actf->b_cylinder && 125 bp->b_blkno < bq->b_actf->b_blkno) 126 goto insert; 127 bq = bq->b_actf; 128 } while (bq->b_actf); 129 goto insert; /* after last */ 130 } 131 bq = bq->b_actf; 132 } 133 /* 134 * No inversions... we will go after the last, and 135 * be the first request in the second request list. 136 */ 137 goto insert; 138 } 139 /* 140 * Request is at/after the current request... 141 * sort in the first request list. 142 */ 143 while (bq->b_actf) { 144 /* 145 * We want to go after the current request if there is an 146 * inversion after it (i.e. it is the end of the first 147 * request list), or if the next request is a larger cylinder 148 * than our request. 149 */ 150 if (bq->b_actf->b_cylinder < bq->b_cylinder || 151 bp->b_cylinder < bq->b_actf->b_cylinder || 152 (bp->b_cylinder == bq->b_actf->b_cylinder && 153 bp->b_blkno < bq->b_actf->b_blkno)) 154 goto insert; 155 bq = bq->b_actf; 156 } 157 /* 158 * Neither a second list nor a larger request... we go at the end of 159 * the first list, which is the same as the end of the whole schebang. 160 */ 161 insert: bp->b_actf = bq->b_actf; 162 bq->b_actf = bp; 163 } 164 165 /* 166 * Compute checksum for disk label. 167 */ 168 u_int 169 dkcksum(lp) 170 register struct disklabel *lp; 171 { 172 register u_int16_t *start, *end; 173 register u_int16_t sum = 0; 174 175 start = (u_int16_t *)lp; 176 end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions]; 177 while (start < end) 178 sum ^= *start++; 179 return (sum); 180 } 181 182 /* 183 * Disk error is the preface to plaintive error messages 184 * about failing disk transfers. It prints messages of the form 185 186 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 187 188 * if the offset of the error in the transfer and a disk label 189 * are both available. blkdone should be -1 if the position of the error 190 * is unknown; the disklabel pointer may be null from drivers that have not 191 * been converted to use them. The message is printed with printf 192 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 193 * The message should be completed (with at least a newline) with printf 194 * or addlog, respectively. There is no trailing space. 195 */ 196 void 197 diskerr(bp, dname, what, pri, blkdone, lp) 198 register struct buf *bp; 199 char *dname, *what; 200 int pri, blkdone; 201 register struct disklabel *lp; 202 { 203 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 204 register int (*pr)(const char *, ...); 205 char partname = 'a' + part; 206 int sn; 207 208 if (pri != LOG_PRINTF) { 209 static const char fmt[] = ""; 210 log(pri, fmt); 211 pr = addlog; 212 } else 213 pr = printf; 214 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 215 bp->b_flags & B_READ ? "read" : "writ"); 216 sn = bp->b_blkno; 217 if (bp->b_bcount <= DEV_BSIZE) 218 (*pr)("%d", sn); 219 else { 220 if (blkdone >= 0) { 221 sn += blkdone; 222 (*pr)("%d of ", sn); 223 } 224 (*pr)("%d-%d", bp->b_blkno, 225 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 226 } 227 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 228 #ifdef tahoe 229 sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ 230 #endif 231 sn += lp->d_partitions[part].p_offset; 232 (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, 233 sn / lp->d_secpercyl); 234 sn %= lp->d_secpercyl; 235 (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); 236 } 237 } 238 239 /* 240 * Initialize the disklist. Called by main() before autoconfiguration. 241 */ 242 void 243 disk_init() 244 { 245 246 TAILQ_INIT(&disklist); 247 disk_count = disk_change = 0; 248 } 249 250 /* 251 * Searches the disklist for the disk corresponding to the 252 * name provided. 253 */ 254 struct disk * 255 disk_find(name) 256 char *name; 257 { 258 struct disk *diskp; 259 260 if ((name == NULL) || (disk_count <= 0)) 261 return (NULL); 262 263 for (diskp = disklist.tqh_first; diskp != NULL; 264 diskp = diskp->dk_link.tqe_next) 265 if (strcmp(diskp->dk_name, name) == 0) 266 return (diskp); 267 268 return (NULL); 269 } 270 271 int 272 disk_construct(diskp, lockname) 273 struct disk *diskp; 274 char *lockname; 275 { 276 lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname, 277 0, LK_CANRECURSE); 278 279 diskp->dk_flags |= DKF_CONSTRUCTED; 280 281 return (0); 282 } 283 284 /* 285 * Attach a disk. 286 */ 287 void 288 disk_attach(diskp) 289 struct disk *diskp; 290 { 291 292 if (!diskp->dk_flags & DKF_CONSTRUCTED) 293 disk_construct(diskp, diskp->dk_name); 294 295 /* 296 * Allocate and initialize the disklabel structures. Note that 297 * it's not safe to sleep here, since we're probably going to be 298 * called during autoconfiguration. 299 */ 300 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 301 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 302 M_NOWAIT); 303 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 304 panic("disk_attach: can't allocate storage for disklabel"); 305 306 bzero(diskp->dk_label, sizeof(struct disklabel)); 307 bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel)); 308 309 /* 310 * Set the attached timestamp. 311 */ 312 microuptime(&diskp->dk_attachtime); 313 314 /* 315 * Link into the disklist. 316 */ 317 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 318 ++disk_count; 319 disk_change = 1; 320 } 321 322 /* 323 * Detach a disk. 324 */ 325 void 326 disk_detach(diskp) 327 struct disk *diskp; 328 { 329 330 /* 331 * Free the space used by the disklabel structures. 332 */ 333 free(diskp->dk_label, M_DEVBUF); 334 free(diskp->dk_cpulabel, M_DEVBUF); 335 336 /* 337 * Remove from the disklist. 338 */ 339 TAILQ_REMOVE(&disklist, diskp, dk_link); 340 disk_change = 1; 341 if (--disk_count < 0) 342 panic("disk_detach: disk_count < 0"); 343 } 344 345 /* 346 * Increment a disk's busy counter. If the counter is going from 347 * 0 to 1, set the timestamp. 348 */ 349 void 350 disk_busy(diskp) 351 struct disk *diskp; 352 { 353 354 /* 355 * XXX We'd like to use something as accurate as microtime(), 356 * but that doesn't depend on the system TOD clock. 357 */ 358 if (diskp->dk_busy++ == 0) { 359 microuptime(&diskp->dk_timestamp); 360 } 361 } 362 363 /* 364 * Decrement a disk's busy counter, increment the byte count, total busy 365 * time, and reset the timestamp. 366 */ 367 void 368 disk_unbusy(diskp, bcount, read) 369 struct disk *diskp; 370 long bcount; 371 int read; 372 { 373 struct timeval dv_time, diff_time; 374 375 if (diskp->dk_busy-- == 0) 376 printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name); 377 378 microuptime(&dv_time); 379 380 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 381 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 382 383 diskp->dk_timestamp = dv_time; 384 if (bcount > 0) { 385 if (read) { 386 diskp->dk_rbytes += bcount; 387 diskp->dk_rxfer++; 388 } else { 389 diskp->dk_wbytes += bcount; 390 diskp->dk_wxfer++; 391 } 392 } else 393 diskp->dk_seek++; 394 395 add_disk_randomness(bcount ^ diff_time.tv_usec); 396 } 397 398 399 int 400 disk_lock(dk) 401 struct disk *dk; 402 { 403 int error; 404 405 error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc); 406 407 return (error); 408 } 409 410 void 411 disk_unlock(dk) 412 struct disk *dk; 413 { 414 lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc); 415 } 416 417 418 /* 419 * Reset the metrics counters on the given disk. Note that we cannot 420 * reset the busy counter, as it may case a panic in disk_unbusy(). 421 * We also must avoid playing with the timestamp information, as it 422 * may skew any pending transfer results. 423 */ 424 void 425 disk_resetstat(diskp) 426 struct disk *diskp; 427 { 428 int s = splbio(); 429 430 diskp->dk_rxfer = 0; 431 diskp->dk_rbytes = 0; 432 diskp->dk_wxfer = 0; 433 diskp->dk_wbytes = 0; 434 diskp->dk_seek = 0; 435 436 microuptime(&diskp->dk_attachtime); 437 438 timerclear(&diskp->dk_time); 439 440 splx(s); 441 } 442 443 444 int 445 dk_mountroot() 446 { 447 dev_t rawdev, rrootdev; 448 int part = DISKPART(rootdev); 449 int (*mountrootfn)(void); 450 struct disklabel dl; 451 int error; 452 453 rrootdev = blktochr(rootdev); 454 rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART); 455 printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev, 456 rrootdev, rawdev); 457 458 /* 459 * open device, ioctl for the disklabel, and close it. 460 */ 461 error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD, 462 S_IFCHR, curproc); 463 if (error) 464 panic("cannot open disk, 0x%x/0x%x, error %d", 465 rootdev, rrootdev, error); 466 error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO, 467 (caddr_t)&dl, FREAD, curproc); 468 if (error) 469 panic("cannot read disk label, 0x%x/0x%x, error %d", 470 rootdev, rrootdev, error); 471 (void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD, 472 S_IFCHR, curproc); 473 474 if (dl.d_partitions[part].p_size == 0) 475 panic("root filesystem has size 0"); 476 switch (dl.d_partitions[part].p_fstype) { 477 #ifdef EXT2FS 478 case FS_EXT2FS: 479 { 480 extern int ext2fs_mountroot(void); 481 mountrootfn = ext2fs_mountroot; 482 } 483 break; 484 #endif 485 #ifdef FFS 486 case FS_BSDFFS: 487 { 488 extern int ffs_mountroot(void); 489 mountrootfn = ffs_mountroot; 490 } 491 break; 492 #endif 493 #ifdef LFS 494 case FS_BSDLFS: 495 { 496 extern int lfs_mountroot(void); 497 mountrootfn = lfs_mountroot; 498 } 499 break; 500 #endif 501 #ifdef CD9660 502 case FS_ISO9660: 503 { 504 extern int cd9660_mountroot(void); 505 mountrootfn = cd9660_mountroot; 506 } 507 break; 508 #endif 509 default: 510 #ifdef FFS 511 { 512 extern int ffs_mountroot(void); 513 514 printf("filesystem type %d not known.. assuming ffs\n", 515 dl.d_partitions[part].p_fstype); 516 mountrootfn = ffs_mountroot; 517 } 518 #else 519 panic("disk 0x%x/0x%x filesystem type %d not known", 520 rootdev, rrootdev, dl.d_partitions[part].p_fstype); 521 #endif 522 } 523 return (*mountrootfn)(); 524 } 525 526 struct bufq * 527 bufq_default_alloc(void) 528 { 529 struct bufq_default *bq; 530 531 bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT); 532 memset(bq, 0, sizeof(*bq)); 533 bq->bufq.bufq_free = bufq_default_free; 534 bq->bufq.bufq_add = bufq_default_add; 535 bq->bufq.bufq_get = bufq_default_get; 536 537 return ((struct bufq *)bq); 538 } 539 540 void 541 bufq_default_free(struct bufq *bq) 542 { 543 free(bq, M_DEVBUF); 544 } 545 546 void 547 bufq_default_add(struct bufq *bq, struct buf *bp) 548 { 549 struct bufq_default *bufq = (struct bufq_default *)bq; 550 struct proc *p = bp->b_proc; 551 struct buf *head; 552 553 if (p == NULL || p->p_nice < NZERO) 554 head = &bufq->bufq_head[0]; 555 else if (p->p_nice == NZERO) 556 head = &bufq->bufq_head[1]; 557 else 558 head = &bufq->bufq_head[2]; 559 560 disksort(head, bp); 561 } 562 563 struct buf * 564 bufq_default_get(struct bufq *bq) 565 { 566 struct bufq_default *bufq = (struct bufq_default *)bq; 567 struct buf *bp, *head; 568 int i; 569 570 for (i = 0; i < 3; i++) { 571 head = &bufq->bufq_head[i]; 572 if ((bp = head->b_actf)) 573 break; 574 } 575 if (bp == NULL) 576 return (NULL); 577 head->b_actf = bp->b_actf; 578 return (bp); 579 } 580