1 /* $OpenBSD: subr_disk.c,v 1.21 2002/03/14 01:27:04 millert Exp $ */ 2 /* $NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1995 Jason R. Thorpe. All rights reserved. 6 * Copyright (c) 1982, 1986, 1988, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by the University of 25 * California, Berkeley and its contributors. 26 * 4. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 41 * 42 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 43 */ 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kernel.h> 48 #include <sys/malloc.h> 49 #include <sys/fcntl.h> 50 #include <sys/buf.h> 51 #include <sys/stat.h> 52 #include <sys/syslog.h> 53 #include <sys/time.h> 54 #include <sys/disklabel.h> 55 #include <sys/conf.h> 56 #include <sys/lock.h> 57 #include <sys/disk.h> 58 #include <sys/dkio.h> 59 #include <sys/dkstat.h> /* XXX */ 60 #include <sys/proc.h> 61 62 #include <dev/rndvar.h> 63 64 /* 65 * A global list of all disks attached to the system. May grow or 66 * shrink over time. 67 */ 68 struct disklist_head disklist; /* TAILQ_HEAD */ 69 int disk_count; /* number of drives in global disklist */ 70 int disk_change; /* set if a disk has been attached/detached 71 * since last we looked at this variable. This 72 * is reset by hw_sysctl() 73 */ 74 75 /* 76 * Seek sort for disks. We depend on the driver which calls us using b_resid 77 * as the current cylinder number. 78 * 79 * The argument ap structure holds a b_actf activity chain pointer on which we 80 * keep two queues, sorted in ascending cylinder order. The first queue holds 81 * those requests which are positioned after the current cylinder (in the first 82 * request); the second holds requests which came in after their cylinder number 83 * was passed. Thus we implement a one way scan, retracting after reaching the 84 * end of the drive to the first request on the second queue, at which time it 85 * becomes the first queue. 86 * 87 * A one-way scan is natural because of the way UNIX read-ahead blocks are 88 * allocated. 89 */ 90 91 void 92 disksort(ap, bp) 93 register struct buf *ap, *bp; 94 { 95 register struct buf *bq; 96 97 /* If the queue is empty, then it's easy. */ 98 if (ap->b_actf == NULL) { 99 bp->b_actf = NULL; 100 ap->b_actf = bp; 101 return; 102 } 103 104 /* 105 * If we lie after the first (currently active) request, then we 106 * must locate the second request list and add ourselves to it. 107 */ 108 bq = ap->b_actf; 109 if (bp->b_cylinder < bq->b_cylinder) { 110 while (bq->b_actf) { 111 /* 112 * Check for an ``inversion'' in the normally ascending 113 * cylinder numbers, indicating the start of the second 114 * request list. 115 */ 116 if (bq->b_actf->b_cylinder < bq->b_cylinder) { 117 /* 118 * Search the second request list for the first 119 * request at a larger cylinder number. We go 120 * before that; if there is no such request, we 121 * go at end. 122 */ 123 do { 124 if (bp->b_cylinder < 125 bq->b_actf->b_cylinder) 126 goto insert; 127 if (bp->b_cylinder == 128 bq->b_actf->b_cylinder && 129 bp->b_blkno < bq->b_actf->b_blkno) 130 goto insert; 131 bq = bq->b_actf; 132 } while (bq->b_actf); 133 goto insert; /* after last */ 134 } 135 bq = bq->b_actf; 136 } 137 /* 138 * No inversions... we will go after the last, and 139 * be the first request in the second request list. 140 */ 141 goto insert; 142 } 143 /* 144 * Request is at/after the current request... 145 * sort in the first request list. 146 */ 147 while (bq->b_actf) { 148 /* 149 * We want to go after the current request if there is an 150 * inversion after it (i.e. it is the end of the first 151 * request list), or if the next request is a larger cylinder 152 * than our request. 153 */ 154 if (bq->b_actf->b_cylinder < bq->b_cylinder || 155 bp->b_cylinder < bq->b_actf->b_cylinder || 156 (bp->b_cylinder == bq->b_actf->b_cylinder && 157 bp->b_blkno < bq->b_actf->b_blkno)) 158 goto insert; 159 bq = bq->b_actf; 160 } 161 /* 162 * Neither a second list nor a larger request... we go at the end of 163 * the first list, which is the same as the end of the whole schebang. 164 */ 165 insert: bp->b_actf = bq->b_actf; 166 bq->b_actf = bp; 167 } 168 169 /* 170 * Compute checksum for disk label. 171 */ 172 u_int 173 dkcksum(lp) 174 register struct disklabel *lp; 175 { 176 register u_int16_t *start, *end; 177 register u_int16_t sum = 0; 178 179 start = (u_int16_t *)lp; 180 end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions]; 181 while (start < end) 182 sum ^= *start++; 183 return (sum); 184 } 185 186 /* 187 * Disk error is the preface to plaintive error messages 188 * about failing disk transfers. It prints messages of the form 189 190 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 191 192 * if the offset of the error in the transfer and a disk label 193 * are both available. blkdone should be -1 if the position of the error 194 * is unknown; the disklabel pointer may be null from drivers that have not 195 * been converted to use them. The message is printed with printf 196 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 197 * The message should be completed (with at least a newline) with printf 198 * or addlog, respectively. There is no trailing space. 199 */ 200 void 201 diskerr(bp, dname, what, pri, blkdone, lp) 202 register struct buf *bp; 203 char *dname, *what; 204 int pri, blkdone; 205 register struct disklabel *lp; 206 { 207 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 208 register int (*pr)(const char *, ...); 209 char partname = 'a' + part; 210 int sn; 211 212 if (pri != LOG_PRINTF) { 213 static const char fmt[] = ""; 214 log(pri, fmt); 215 pr = addlog; 216 } else 217 pr = printf; 218 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 219 bp->b_flags & B_READ ? "read" : "writ"); 220 sn = bp->b_blkno; 221 if (bp->b_bcount <= DEV_BSIZE) 222 (*pr)("%d", sn); 223 else { 224 if (blkdone >= 0) { 225 sn += blkdone; 226 (*pr)("%d of ", sn); 227 } 228 (*pr)("%d-%d", bp->b_blkno, 229 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 230 } 231 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 232 #ifdef tahoe 233 sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ 234 #endif 235 sn += lp->d_partitions[part].p_offset; 236 (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, 237 sn / lp->d_secpercyl); 238 sn %= lp->d_secpercyl; 239 (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); 240 } 241 } 242 243 /* 244 * Initialize the disklist. Called by main() before autoconfiguration. 245 */ 246 void 247 disk_init() 248 { 249 250 TAILQ_INIT(&disklist); 251 disk_count = disk_change = 0; 252 } 253 254 /* 255 * Searches the disklist for the disk corresponding to the 256 * name provided. 257 */ 258 struct disk * 259 disk_find(name) 260 char *name; 261 { 262 struct disk *diskp; 263 264 if ((name == NULL) || (disk_count <= 0)) 265 return (NULL); 266 267 for (diskp = disklist.tqh_first; diskp != NULL; 268 diskp = diskp->dk_link.tqe_next) 269 if (strcmp(diskp->dk_name, name) == 0) 270 return (diskp); 271 272 return (NULL); 273 } 274 275 int 276 disk_construct(diskp, lockname) 277 struct disk *diskp; 278 char *lockname; 279 { 280 lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname, 281 0, LK_CANRECURSE); 282 283 diskp->dk_flags |= DKF_CONSTRUCTED; 284 285 return (0); 286 } 287 288 /* 289 * Attach a disk. 290 */ 291 void 292 disk_attach(diskp) 293 struct disk *diskp; 294 { 295 int s; 296 297 if (!diskp->dk_flags & DKF_CONSTRUCTED) 298 disk_construct(diskp, diskp->dk_name); 299 300 /* 301 * Allocate and initialize the disklabel structures. Note that 302 * it's not safe to sleep here, since we're probably going to be 303 * called during autoconfiguration. 304 */ 305 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 306 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 307 M_NOWAIT); 308 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 309 panic("disk_attach: can't allocate storage for disklabel"); 310 311 bzero(diskp->dk_label, sizeof(struct disklabel)); 312 bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel)); 313 314 /* 315 * Set the attached timestamp. 316 */ 317 s = splclock(); 318 diskp->dk_attachtime = mono_time; 319 splx(s); 320 321 /* 322 * Link into the disklist. 323 */ 324 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 325 ++disk_count; 326 disk_change = 1; 327 } 328 329 /* 330 * Detach a disk. 331 */ 332 void 333 disk_detach(diskp) 334 struct disk *diskp; 335 { 336 337 /* 338 * Free the space used by the disklabel structures. 339 */ 340 free(diskp->dk_label, M_DEVBUF); 341 free(diskp->dk_cpulabel, M_DEVBUF); 342 343 /* 344 * Remove from the disklist. 345 */ 346 TAILQ_REMOVE(&disklist, diskp, dk_link); 347 disk_change = 1; 348 if (--disk_count < 0) 349 panic("disk_detach: disk_count < 0"); 350 } 351 352 /* 353 * Increment a disk's busy counter. If the counter is going from 354 * 0 to 1, set the timestamp. 355 */ 356 void 357 disk_busy(diskp) 358 struct disk *diskp; 359 { 360 int s; 361 362 /* 363 * XXX We'd like to use something as accurate as microtime(), 364 * but that doesn't depend on the system TOD clock. 365 */ 366 if (diskp->dk_busy++ == 0) { 367 s = splclock(); 368 diskp->dk_timestamp = mono_time; 369 splx(s); 370 } 371 } 372 373 /* 374 * Decrement a disk's busy counter, increment the byte count, total busy 375 * time, and reset the timestamp. 376 */ 377 void 378 disk_unbusy(diskp, bcount) 379 struct disk *diskp; 380 long bcount; 381 { 382 int s; 383 struct timeval dv_time, diff_time; 384 385 if (diskp->dk_busy-- == 0) 386 printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name); 387 388 s = splclock(); 389 dv_time = mono_time; 390 splx(s); 391 392 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 393 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 394 395 diskp->dk_timestamp = dv_time; 396 if (bcount > 0) { 397 diskp->dk_bytes += bcount; 398 diskp->dk_xfer++; 399 } 400 diskp->dk_seek++; 401 402 add_disk_randomness(bcount ^ diff_time.tv_usec); 403 } 404 405 406 int 407 disk_lock(dk) 408 struct disk *dk; 409 { 410 int error; 411 412 error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, 0, curproc); 413 414 return (error); 415 } 416 417 void 418 disk_unlock(dk) 419 struct disk *dk; 420 { 421 lockmgr(&dk->dk_lock, LK_RELEASE, 0, curproc); 422 } 423 424 425 /* 426 * Reset the metrics counters on the given disk. Note that we cannot 427 * reset the busy counter, as it may case a panic in disk_unbusy(). 428 * We also must avoid playing with the timestamp information, as it 429 * may skew any pending transfer results. 430 */ 431 void 432 disk_resetstat(diskp) 433 struct disk *diskp; 434 { 435 int s = splbio(), t; 436 437 diskp->dk_xfer = 0; 438 diskp->dk_bytes = 0; 439 diskp->dk_seek = 0; 440 441 t = splclock(); 442 diskp->dk_attachtime = mono_time; 443 splx(t); 444 445 timerclear(&diskp->dk_time); 446 447 splx(s); 448 } 449 450 451 int 452 dk_mountroot() 453 { 454 dev_t rawdev, rrootdev; 455 int part = DISKPART(rootdev); 456 int (*mountrootfn)(void); 457 struct disklabel dl; 458 int error; 459 460 rrootdev = blktochr(rootdev); 461 rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART); 462 printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev, 463 rrootdev, rawdev); 464 465 /* 466 * open device, ioctl for the disklabel, and close it. 467 */ 468 error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD, 469 S_IFCHR, curproc); 470 if (error) 471 panic("cannot open disk, 0x%x/0x%x, error %d", 472 rootdev, rrootdev, error); 473 error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO, 474 (caddr_t)&dl, FREAD, curproc); 475 if (error) 476 panic("cannot read disk label, 0x%x/0x%x, error %d", 477 rootdev, rrootdev, error); 478 (void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD, 479 S_IFCHR, curproc); 480 481 if (dl.d_partitions[part].p_size == 0) 482 panic("root filesystem has size 0"); 483 switch (dl.d_partitions[part].p_fstype) { 484 #ifdef EXT2FS 485 case FS_EXT2FS: 486 { 487 extern int ext2fs_mountroot(void); 488 mountrootfn = ext2fs_mountroot; 489 } 490 break; 491 #endif 492 #ifdef FFS 493 case FS_BSDFFS: 494 { 495 extern int ffs_mountroot(void); 496 mountrootfn = ffs_mountroot; 497 } 498 break; 499 #endif 500 #ifdef LFS 501 case FS_BSDLFS: 502 { 503 extern int lfs_mountroot(void); 504 mountrootfn = lfs_mountroot; 505 } 506 break; 507 #endif 508 #ifdef CD9660 509 case FS_ISO9660: 510 { 511 extern int cd9660_mountroot(void); 512 mountrootfn = cd9660_mountroot; 513 } 514 break; 515 #endif 516 default: 517 #ifdef FFS 518 { 519 extern int ffs_mountroot(void); 520 521 printf("filesystem type %d not known.. assuming ffs\n", 522 dl.d_partitions[part].p_fstype); 523 mountrootfn = ffs_mountroot; 524 } 525 #else 526 panic("disk 0x%x/0x%x filesystem type %d not known", 527 rootdev, rrootdev, dl.d_partitions[part].p_fstype); 528 #endif 529 } 530 return (*mountrootfn)(); 531 } 532