1 /* $OpenBSD: subr_disk.c,v 1.32 2006/05/11 18:58:59 miod Exp $ */ 2 /* $NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1995 Jason R. Thorpe. All rights reserved. 6 * Copyright (c) 1982, 1986, 1988, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 39 */ 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/kernel.h> 44 #include <sys/malloc.h> 45 #include <sys/fcntl.h> 46 #include <sys/buf.h> 47 #include <sys/stat.h> 48 #include <sys/syslog.h> 49 #include <sys/time.h> 50 #include <sys/disklabel.h> 51 #include <sys/conf.h> 52 #include <sys/lock.h> 53 #include <sys/disk.h> 54 #include <sys/dkio.h> 55 #include <sys/dkstat.h> /* XXX */ 56 #include <sys/proc.h> 57 58 #include <dev/rndvar.h> 59 60 /* 61 * A global list of all disks attached to the system. May grow or 62 * shrink over time. 63 */ 64 struct disklist_head disklist; /* TAILQ_HEAD */ 65 int disk_count; /* number of drives in global disklist */ 66 int disk_change; /* set if a disk has been attached/detached 67 * since last we looked at this variable. This 68 * is reset by hw_sysctl() 69 */ 70 71 /* 72 * Seek sort for disks. We depend on the driver which calls us using b_resid 73 * as the current cylinder number. 74 * 75 * The argument ap structure holds a b_actf activity chain pointer on which we 76 * keep two queues, sorted in ascending cylinder order. The first queue holds 77 * those requests which are positioned after the current cylinder (in the first 78 * request); the second holds requests which came in after their cylinder number 79 * was passed. Thus we implement a one way scan, retracting after reaching the 80 * end of the drive to the first request on the second queue, at which time it 81 * becomes the first queue. 82 * 83 * A one-way scan is natural because of the way UNIX read-ahead blocks are 84 * allocated. 85 */ 86 87 void 88 disksort(struct buf *ap, struct buf *bp) 89 { 90 struct buf *bq; 91 92 /* If the queue is empty, then it's easy. */ 93 if (ap->b_actf == NULL) { 94 bp->b_actf = NULL; 95 ap->b_actf = bp; 96 return; 97 } 98 99 /* 100 * If we lie after the first (currently active) request, then we 101 * must locate the second request list and add ourselves to it. 102 */ 103 bq = ap->b_actf; 104 if (bp->b_cylinder < bq->b_cylinder) { 105 while (bq->b_actf) { 106 /* 107 * Check for an ``inversion'' in the normally ascending 108 * cylinder numbers, indicating the start of the second 109 * request list. 110 */ 111 if (bq->b_actf->b_cylinder < bq->b_cylinder) { 112 /* 113 * Search the second request list for the first 114 * request at a larger cylinder number. We go 115 * before that; if there is no such request, we 116 * go at end. 117 */ 118 do { 119 if (bp->b_cylinder < 120 bq->b_actf->b_cylinder) 121 goto insert; 122 if (bp->b_cylinder == 123 bq->b_actf->b_cylinder && 124 bp->b_blkno < bq->b_actf->b_blkno) 125 goto insert; 126 bq = bq->b_actf; 127 } while (bq->b_actf); 128 goto insert; /* after last */ 129 } 130 bq = bq->b_actf; 131 } 132 /* 133 * No inversions... we will go after the last, and 134 * be the first request in the second request list. 135 */ 136 goto insert; 137 } 138 /* 139 * Request is at/after the current request... 140 * sort in the first request list. 141 */ 142 while (bq->b_actf) { 143 /* 144 * We want to go after the current request if there is an 145 * inversion after it (i.e. it is the end of the first 146 * request list), or if the next request is a larger cylinder 147 * than our request. 148 */ 149 if (bq->b_actf->b_cylinder < bq->b_cylinder || 150 bp->b_cylinder < bq->b_actf->b_cylinder || 151 (bp->b_cylinder == bq->b_actf->b_cylinder && 152 bp->b_blkno < bq->b_actf->b_blkno)) 153 goto insert; 154 bq = bq->b_actf; 155 } 156 /* 157 * Neither a second list nor a larger request... we go at the end of 158 * the first list, which is the same as the end of the whole schebang. 159 */ 160 insert: bp->b_actf = bq->b_actf; 161 bq->b_actf = bp; 162 } 163 164 /* 165 * Compute checksum for disk label. 166 */ 167 u_int 168 dkcksum(struct disklabel *lp) 169 { 170 u_int16_t *start, *end; 171 u_int16_t sum = 0; 172 173 start = (u_int16_t *)lp; 174 end = (u_int16_t *)&lp->d_partitions[lp->d_npartitions]; 175 while (start < end) 176 sum ^= *start++; 177 return (sum); 178 } 179 180 /* 181 * Disk error is the preface to plaintive error messages 182 * about failing disk transfers. It prints messages of the form 183 184 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 185 186 * if the offset of the error in the transfer and a disk label 187 * are both available. blkdone should be -1 if the position of the error 188 * is unknown; the disklabel pointer may be null from drivers that have not 189 * been converted to use them. The message is printed with printf 190 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 191 * The message should be completed (with at least a newline) with printf 192 * or addlog, respectively. There is no trailing space. 193 */ 194 void 195 diskerr(struct buf *bp, char *dname, char *what, int pri, int blkdone, 196 struct disklabel *lp) 197 { 198 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 199 int (*pr)(const char *, ...); 200 char partname = 'a' + part; 201 int sn; 202 203 if (pri != LOG_PRINTF) { 204 static const char fmt[] = ""; 205 log(pri, fmt); 206 pr = addlog; 207 } else 208 pr = printf; 209 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 210 bp->b_flags & B_READ ? "read" : "writ"); 211 sn = bp->b_blkno; 212 if (bp->b_bcount <= DEV_BSIZE) 213 (*pr)("%d", sn); 214 else { 215 if (blkdone >= 0) { 216 sn += blkdone; 217 (*pr)("%d of ", sn); 218 } 219 (*pr)("%d-%d", bp->b_blkno, 220 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 221 } 222 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 223 sn += lp->d_partitions[part].p_offset; 224 (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, 225 sn / lp->d_secpercyl); 226 sn %= lp->d_secpercyl; 227 (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); 228 } 229 } 230 231 /* 232 * Initialize the disklist. Called by main() before autoconfiguration. 233 */ 234 void 235 disk_init(void) 236 { 237 238 TAILQ_INIT(&disklist); 239 disk_count = disk_change = 0; 240 } 241 242 /* 243 * Searches the disklist for the disk corresponding to the 244 * name provided. 245 */ 246 struct disk * 247 disk_find(char *name) 248 { 249 struct disk *diskp; 250 251 if ((name == NULL) || (disk_count <= 0)) 252 return (NULL); 253 254 TAILQ_FOREACH(diskp, &disklist, dk_link) 255 if (strcmp(diskp->dk_name, name) == 0) 256 return (diskp); 257 258 return (NULL); 259 } 260 261 int 262 disk_construct(struct disk *diskp, char *lockname) 263 { 264 lockinit(&diskp->dk_lock, PRIBIO | PCATCH, lockname, 265 0, LK_CANRECURSE); 266 267 diskp->dk_flags |= DKF_CONSTRUCTED; 268 269 return (0); 270 } 271 272 /* 273 * Attach a disk. 274 */ 275 void 276 disk_attach(struct disk *diskp) 277 { 278 279 if (!ISSET(diskp->dk_flags, DKF_CONSTRUCTED)) 280 disk_construct(diskp, diskp->dk_name); 281 282 /* 283 * Allocate and initialize the disklabel structures. Note that 284 * it's not safe to sleep here, since we're probably going to be 285 * called during autoconfiguration. 286 */ 287 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 288 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 289 M_NOWAIT); 290 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 291 panic("disk_attach: can't allocate storage for disklabel"); 292 293 bzero(diskp->dk_label, sizeof(struct disklabel)); 294 bzero(diskp->dk_cpulabel, sizeof(struct cpu_disklabel)); 295 296 /* 297 * Set the attached timestamp. 298 */ 299 microuptime(&diskp->dk_attachtime); 300 301 /* 302 * Link into the disklist. 303 */ 304 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 305 ++disk_count; 306 disk_change = 1; 307 } 308 309 /* 310 * Detach a disk. 311 */ 312 void 313 disk_detach(struct disk *diskp) 314 { 315 316 /* 317 * Free the space used by the disklabel structures. 318 */ 319 free(diskp->dk_label, M_DEVBUF); 320 free(diskp->dk_cpulabel, M_DEVBUF); 321 322 /* 323 * Remove from the disklist. 324 */ 325 TAILQ_REMOVE(&disklist, diskp, dk_link); 326 disk_change = 1; 327 if (--disk_count < 0) 328 panic("disk_detach: disk_count < 0"); 329 } 330 331 /* 332 * Increment a disk's busy counter. If the counter is going from 333 * 0 to 1, set the timestamp. 334 */ 335 void 336 disk_busy(struct disk *diskp) 337 { 338 339 /* 340 * XXX We'd like to use something as accurate as microtime(), 341 * but that doesn't depend on the system TOD clock. 342 */ 343 if (diskp->dk_busy++ == 0) { 344 microuptime(&diskp->dk_timestamp); 345 } 346 } 347 348 /* 349 * Decrement a disk's busy counter, increment the byte count, total busy 350 * time, and reset the timestamp. 351 */ 352 void 353 disk_unbusy(struct disk *diskp, long bcount, int read) 354 { 355 struct timeval dv_time, diff_time; 356 357 if (diskp->dk_busy-- == 0) 358 printf("disk_unbusy: %s: dk_busy < 0\n", diskp->dk_name); 359 360 microuptime(&dv_time); 361 362 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 363 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 364 365 diskp->dk_timestamp = dv_time; 366 if (bcount > 0) { 367 if (read) { 368 diskp->dk_rbytes += bcount; 369 diskp->dk_rxfer++; 370 } else { 371 diskp->dk_wbytes += bcount; 372 diskp->dk_wxfer++; 373 } 374 } else 375 diskp->dk_seek++; 376 377 add_disk_randomness(bcount ^ diff_time.tv_usec); 378 } 379 380 int 381 disk_lock(struct disk *dk) 382 { 383 int error; 384 385 error = lockmgr(&dk->dk_lock, LK_EXCLUSIVE, NULL); 386 387 return (error); 388 } 389 390 void 391 disk_unlock(struct disk *dk) 392 { 393 lockmgr(&dk->dk_lock, LK_RELEASE, NULL); 394 } 395 396 /* 397 * Reset the metrics counters on the given disk. Note that we cannot 398 * reset the busy counter, as it may case a panic in disk_unbusy(). 399 * We also must avoid playing with the timestamp information, as it 400 * may skew any pending transfer results. 401 */ 402 void 403 disk_resetstat(struct disk *diskp) 404 { 405 int s = splbio(); 406 407 diskp->dk_rxfer = 0; 408 diskp->dk_rbytes = 0; 409 diskp->dk_wxfer = 0; 410 diskp->dk_wbytes = 0; 411 diskp->dk_seek = 0; 412 413 microuptime(&diskp->dk_attachtime); 414 415 timerclear(&diskp->dk_time); 416 417 splx(s); 418 } 419 420 421 int 422 dk_mountroot(void) 423 { 424 dev_t rawdev, rrootdev; 425 int part = DISKPART(rootdev); 426 int (*mountrootfn)(void); 427 struct disklabel dl; 428 int error; 429 430 rrootdev = blktochr(rootdev); 431 rawdev = MAKEDISKDEV(major(rrootdev), DISKUNIT(rootdev), RAW_PART); 432 printf("rootdev=0x%x rrootdev=0x%x rawdev=0x%x\n", rootdev, 433 rrootdev, rawdev); 434 435 /* 436 * open device, ioctl for the disklabel, and close it. 437 */ 438 error = (cdevsw[major(rrootdev)].d_open)(rawdev, FREAD, 439 S_IFCHR, curproc); 440 if (error) 441 panic("cannot open disk, 0x%x/0x%x, error %d", 442 rootdev, rrootdev, error); 443 error = (cdevsw[major(rrootdev)].d_ioctl)(rawdev, DIOCGDINFO, 444 (caddr_t)&dl, FREAD, curproc); 445 if (error) 446 panic("cannot read disk label, 0x%x/0x%x, error %d", 447 rootdev, rrootdev, error); 448 (void) (cdevsw[major(rrootdev)].d_close)(rawdev, FREAD, 449 S_IFCHR, curproc); 450 451 if (dl.d_partitions[part].p_size == 0) 452 panic("root filesystem has size 0"); 453 switch (dl.d_partitions[part].p_fstype) { 454 #ifdef EXT2FS 455 case FS_EXT2FS: 456 { 457 extern int ext2fs_mountroot(void); 458 mountrootfn = ext2fs_mountroot; 459 } 460 break; 461 #endif 462 #ifdef FFS 463 case FS_BSDFFS: 464 { 465 extern int ffs_mountroot(void); 466 mountrootfn = ffs_mountroot; 467 } 468 break; 469 #endif 470 #ifdef LFS 471 case FS_BSDLFS: 472 { 473 extern int lfs_mountroot(void); 474 mountrootfn = lfs_mountroot; 475 } 476 break; 477 #endif 478 #ifdef CD9660 479 case FS_ISO9660: 480 { 481 extern int cd9660_mountroot(void); 482 mountrootfn = cd9660_mountroot; 483 } 484 break; 485 #endif 486 default: 487 #ifdef FFS 488 { 489 extern int ffs_mountroot(void); 490 491 printf("filesystem type %d not known.. assuming ffs\n", 492 dl.d_partitions[part].p_fstype); 493 mountrootfn = ffs_mountroot; 494 } 495 #else 496 panic("disk 0x%x/0x%x filesystem type %d not known", 497 rootdev, rrootdev, dl.d_partitions[part].p_fstype); 498 #endif 499 } 500 return (*mountrootfn)(); 501 } 502 503 struct bufq * 504 bufq_default_alloc(void) 505 { 506 struct bufq_default *bq; 507 508 bq = malloc(sizeof(*bq), M_DEVBUF, M_NOWAIT); 509 if (bq == NULL) 510 panic("bufq_default_alloc: no memory"); 511 512 memset(bq, 0, sizeof(*bq)); 513 bq->bufq.bufq_free = bufq_default_free; 514 bq->bufq.bufq_add = bufq_default_add; 515 bq->bufq.bufq_get = bufq_default_get; 516 517 return ((struct bufq *)bq); 518 } 519 520 void 521 bufq_default_free(struct bufq *bq) 522 { 523 free(bq, M_DEVBUF); 524 } 525 526 void 527 bufq_default_add(struct bufq *bq, struct buf *bp) 528 { 529 struct bufq_default *bufq = (struct bufq_default *)bq; 530 struct proc *p = bp->b_proc; 531 struct buf *head; 532 533 if (p == NULL || p->p_nice < NZERO) 534 head = &bufq->bufq_head[0]; 535 else if (p->p_nice == NZERO) 536 head = &bufq->bufq_head[1]; 537 else 538 head = &bufq->bufq_head[2]; 539 540 disksort(head, bp); 541 } 542 543 struct buf * 544 bufq_default_get(struct bufq *bq) 545 { 546 struct bufq_default *bufq = (struct bufq_default *)bq; 547 struct buf *bp, *head; 548 int i; 549 550 for (i = 0; i < 3; i++) { 551 head = &bufq->bufq_head[i]; 552 if ((bp = head->b_actf)) 553 break; 554 } 555 if (bp == NULL) 556 return (NULL); 557 head->b_actf = bp->b_actf; 558 return (bp); 559 } 560