1 /* $NetBSD: subr_disk.c,v 1.64 2004/10/28 07:07:46 yamt Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 /* 41 * Copyright (c) 1982, 1986, 1988, 1993 42 * The Regents of the University of California. All rights reserved. 43 * (c) UNIX System Laboratories, Inc. 44 * All or some portions of this file are derived from material licensed 45 * to the University of California by American Telephone and Telegraph 46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 47 * the permission of UNIX System Laboratories, Inc. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. Neither the name of the University nor the names of its contributors 58 * may be used to endorse or promote products derived from this software 59 * without specific prior written permission. 60 * 61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 64 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 71 * SUCH DAMAGE. 72 * 73 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 74 */ 75 76 #include <sys/cdefs.h> 77 __KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.64 2004/10/28 07:07:46 yamt Exp $"); 78 79 #include "opt_compat_netbsd.h" 80 #include "opt_bufq.h" 81 82 #include <sys/param.h> 83 #include <sys/kernel.h> 84 #include <sys/malloc.h> 85 #include <sys/buf.h> 86 #include <sys/bufq.h> 87 #include <sys/syslog.h> 88 #include <sys/disklabel.h> 89 #include <sys/disk.h> 90 #include <sys/sysctl.h> 91 #include <lib/libkern/libkern.h> 92 93 /* 94 * A global list of all disks attached to the system. May grow or 95 * shrink over time. 96 */ 97 struct disklist_head disklist = TAILQ_HEAD_INITIALIZER(disklist); 98 int disk_count; /* number of drives in global disklist */ 99 struct simplelock disklist_slock = SIMPLELOCK_INITIALIZER; 100 101 #ifdef NEW_BUFQ_STRATEGY 102 int bufq_disk_default_strat = BUFQ_READ_PRIO; 103 #else /* NEW_BUFQ_STRATEGY */ 104 int bufq_disk_default_strat = BUFQ_DISKSORT; 105 #endif /* NEW_BUFQ_STRATEGY */ 106 107 /* 108 * Compute checksum for disk label. 109 */ 110 u_int 111 dkcksum(struct disklabel *lp) 112 { 113 u_short *start, *end; 114 u_short sum = 0; 115 116 start = (u_short *)lp; 117 end = (u_short *)&lp->d_partitions[lp->d_npartitions]; 118 while (start < end) 119 sum ^= *start++; 120 return (sum); 121 } 122 123 /* 124 * Disk error is the preface to plaintive error messages 125 * about failing disk transfers. It prints messages of the form 126 127 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 128 129 * if the offset of the error in the transfer and a disk label 130 * are both available. blkdone should be -1 if the position of the error 131 * is unknown; the disklabel pointer may be null from drivers that have not 132 * been converted to use them. The message is printed with printf 133 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 134 * The message should be completed (with at least a newline) with printf 135 * or addlog, respectively. There is no trailing space. 136 */ 137 #ifndef PRIdaddr 138 #define PRIdaddr PRId64 139 #endif 140 void 141 diskerr(const struct buf *bp, const char *dname, const char *what, int pri, 142 int blkdone, const struct disklabel *lp) 143 { 144 int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev); 145 void (*pr)(const char *, ...); 146 char partname = 'a' + part; 147 daddr_t sn; 148 149 if (/*CONSTCOND*/0) 150 /* Compiler will error this is the format is wrong... */ 151 printf("%" PRIdaddr, bp->b_blkno); 152 153 if (pri != LOG_PRINTF) { 154 static const char fmt[] = ""; 155 log(pri, fmt); 156 pr = addlog; 157 } else 158 pr = printf; 159 (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, 160 bp->b_flags & B_READ ? "read" : "writ"); 161 sn = bp->b_blkno; 162 if (bp->b_bcount <= DEV_BSIZE) 163 (*pr)("%" PRIdaddr, sn); 164 else { 165 if (blkdone >= 0) { 166 sn += blkdone; 167 (*pr)("%" PRIdaddr " of ", sn); 168 } 169 (*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno, 170 bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); 171 } 172 if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { 173 sn += lp->d_partitions[part].p_offset; 174 (*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "", 175 dname, unit, sn, sn / lp->d_secpercyl); 176 sn %= lp->d_secpercyl; 177 (*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")", 178 sn / lp->d_nsectors, sn % lp->d_nsectors); 179 } 180 } 181 182 /* 183 * Searches the disklist for the disk corresponding to the 184 * name provided. 185 */ 186 struct disk * 187 disk_find(char *name) 188 { 189 struct disk *diskp; 190 191 if ((name == NULL) || (disk_count <= 0)) 192 return (NULL); 193 194 simple_lock(&disklist_slock); 195 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 196 diskp = TAILQ_NEXT(diskp, dk_link)) 197 if (strcmp(diskp->dk_name, name) == 0) { 198 simple_unlock(&disklist_slock); 199 return (diskp); 200 } 201 simple_unlock(&disklist_slock); 202 203 return (NULL); 204 } 205 206 /* 207 * Attach a disk. 208 */ 209 void 210 disk_attach(struct disk *diskp) 211 { 212 int s; 213 214 /* 215 * Allocate and initialize the disklabel structures. Note that 216 * it's not safe to sleep here, since we're probably going to be 217 * called during autoconfiguration. 218 */ 219 diskp->dk_label = malloc(sizeof(struct disklabel), M_DEVBUF, M_NOWAIT); 220 diskp->dk_cpulabel = malloc(sizeof(struct cpu_disklabel), M_DEVBUF, 221 M_NOWAIT); 222 if ((diskp->dk_label == NULL) || (diskp->dk_cpulabel == NULL)) 223 panic("disk_attach: can't allocate storage for disklabel"); 224 225 memset(diskp->dk_label, 0, sizeof(struct disklabel)); 226 memset(diskp->dk_cpulabel, 0, sizeof(struct cpu_disklabel)); 227 228 /* 229 * Initialize the wedge-related locks and other fields. 230 */ 231 lockinit(&diskp->dk_rawlock, PRIBIO, "dkrawlk", 0, 0); 232 lockinit(&diskp->dk_openlock, PRIBIO, "dkoplk", 0, 0); 233 LIST_INIT(&diskp->dk_wedges); 234 diskp->dk_nwedges = 0; 235 236 /* 237 * Set the attached timestamp. 238 */ 239 s = splclock(); 240 diskp->dk_attachtime = mono_time; 241 splx(s); 242 243 /* 244 * Link into the disklist. 245 */ 246 simple_lock(&disklist_slock); 247 TAILQ_INSERT_TAIL(&disklist, diskp, dk_link); 248 disk_count++; 249 simple_unlock(&disklist_slock); 250 } 251 252 /* 253 * Detach a disk. 254 */ 255 void 256 disk_detach(struct disk *diskp) 257 { 258 259 (void) lockmgr(&diskp->dk_openlock, LK_DRAIN, NULL); 260 261 /* 262 * Remove from the disklist. 263 */ 264 if (disk_count == 0) 265 panic("disk_detach: disk_count == 0"); 266 simple_lock(&disklist_slock); 267 TAILQ_REMOVE(&disklist, diskp, dk_link); 268 disk_count--; 269 simple_unlock(&disklist_slock); 270 271 /* 272 * Free the space used by the disklabel structures. 273 */ 274 free(diskp->dk_label, M_DEVBUF); 275 free(diskp->dk_cpulabel, M_DEVBUF); 276 } 277 278 /* 279 * Increment a disk's busy counter. If the counter is going from 280 * 0 to 1, set the timestamp. 281 */ 282 void 283 disk_busy(struct disk *diskp) 284 { 285 int s; 286 287 /* 288 * XXX We'd like to use something as accurate as microtime(), 289 * but that doesn't depend on the system TOD clock. 290 */ 291 if (diskp->dk_busy++ == 0) { 292 s = splclock(); 293 diskp->dk_timestamp = mono_time; 294 splx(s); 295 } 296 } 297 298 /* 299 * Decrement a disk's busy counter, increment the byte count, total busy 300 * time, and reset the timestamp. 301 */ 302 void 303 disk_unbusy(struct disk *diskp, long bcount, int read) 304 { 305 int s; 306 struct timeval dv_time, diff_time; 307 308 if (diskp->dk_busy-- == 0) { 309 printf("%s: dk_busy < 0\n", diskp->dk_name); 310 panic("disk_unbusy"); 311 } 312 313 s = splclock(); 314 dv_time = mono_time; 315 splx(s); 316 317 timersub(&dv_time, &diskp->dk_timestamp, &diff_time); 318 timeradd(&diskp->dk_time, &diff_time, &diskp->dk_time); 319 320 diskp->dk_timestamp = dv_time; 321 if (bcount > 0) { 322 if (read) { 323 diskp->dk_rbytes += bcount; 324 diskp->dk_rxfer++; 325 } else { 326 diskp->dk_wbytes += bcount; 327 diskp->dk_wxfer++; 328 } 329 } 330 } 331 332 /* 333 * Reset the metrics counters on the given disk. Note that we cannot 334 * reset the busy counter, as it may case a panic in disk_unbusy(). 335 * We also must avoid playing with the timestamp information, as it 336 * may skew any pending transfer results. 337 */ 338 void 339 disk_resetstat(struct disk *diskp) 340 { 341 int s = splbio(), t; 342 343 diskp->dk_rxfer = 0; 344 diskp->dk_rbytes = 0; 345 diskp->dk_wxfer = 0; 346 diskp->dk_wbytes = 0; 347 348 t = splclock(); 349 diskp->dk_attachtime = mono_time; 350 splx(t); 351 352 timerclear(&diskp->dk_time); 353 354 splx(s); 355 } 356 357 int 358 sysctl_hw_disknames(SYSCTLFN_ARGS) 359 { 360 char buf[DK_DISKNAMELEN + 1]; 361 char *where = oldp; 362 struct disk *diskp; 363 size_t needed, left, slen; 364 int error, first; 365 366 if (newp != NULL) 367 return (EPERM); 368 if (namelen != 0) 369 return (EINVAL); 370 371 first = 1; 372 error = 0; 373 needed = 0; 374 left = *oldlenp; 375 376 simple_lock(&disklist_slock); 377 for (diskp = TAILQ_FIRST(&disklist); diskp != NULL; 378 diskp = TAILQ_NEXT(diskp, dk_link)) { 379 if (where == NULL) 380 needed += strlen(diskp->dk_name) + 1; 381 else { 382 memset(buf, 0, sizeof(buf)); 383 if (first) { 384 strncpy(buf, diskp->dk_name, sizeof(buf)); 385 first = 0; 386 } else { 387 buf[0] = ' '; 388 strncpy(buf + 1, diskp->dk_name, 389 sizeof(buf) - 1); 390 } 391 buf[DK_DISKNAMELEN] = '\0'; 392 slen = strlen(buf); 393 if (left < slen + 1) 394 break; 395 /* +1 to copy out the trailing NUL byte */ 396 error = copyout(buf, where, slen + 1); 397 if (error) 398 break; 399 where += slen; 400 needed += slen; 401 left -= slen; 402 } 403 } 404 simple_unlock(&disklist_slock); 405 *oldlenp = needed; 406 return (error); 407 } 408 409 int 410 sysctl_hw_diskstats(SYSCTLFN_ARGS) 411 { 412 struct disk_sysctl sdisk; 413 struct disk *diskp; 414 char *where = oldp; 415 size_t tocopy, left; 416 int error; 417 418 if (newp != NULL) 419 return (EPERM); 420 421 /* 422 * The original hw.diskstats call was broken and did not require 423 * the userland to pass in it's size of struct disk_sysctl. This 424 * was fixed after NetBSD 1.6 was released, and any applications 425 * that do not pass in the size are given an error only, unless 426 * we care about 1.6 compatibility. 427 */ 428 if (namelen == 0) 429 #ifdef COMPAT_16 430 tocopy = offsetof(struct disk_sysctl, dk_rxfer); 431 #else 432 return (EINVAL); 433 #endif 434 else 435 tocopy = name[0]; 436 437 if (where == NULL) { 438 *oldlenp = disk_count * tocopy; 439 return (0); 440 } 441 442 error = 0; 443 left = *oldlenp; 444 memset(&sdisk, 0, sizeof(sdisk)); 445 *oldlenp = 0; 446 447 simple_lock(&disklist_slock); 448 TAILQ_FOREACH(diskp, &disklist, dk_link) { 449 if (left < tocopy) 450 break; 451 strncpy(sdisk.dk_name, diskp->dk_name, sizeof(sdisk.dk_name)); 452 sdisk.dk_xfer = diskp->dk_rxfer + diskp->dk_wxfer; 453 sdisk.dk_rxfer = diskp->dk_rxfer; 454 sdisk.dk_wxfer = diskp->dk_wxfer; 455 sdisk.dk_seek = diskp->dk_seek; 456 sdisk.dk_bytes = diskp->dk_rbytes + diskp->dk_wbytes; 457 sdisk.dk_rbytes = diskp->dk_rbytes; 458 sdisk.dk_wbytes = diskp->dk_wbytes; 459 sdisk.dk_attachtime_sec = diskp->dk_attachtime.tv_sec; 460 sdisk.dk_attachtime_usec = diskp->dk_attachtime.tv_usec; 461 sdisk.dk_timestamp_sec = diskp->dk_timestamp.tv_sec; 462 sdisk.dk_timestamp_usec = diskp->dk_timestamp.tv_usec; 463 sdisk.dk_time_sec = diskp->dk_time.tv_sec; 464 sdisk.dk_time_usec = diskp->dk_time.tv_usec; 465 sdisk.dk_busy = diskp->dk_busy; 466 467 error = copyout(&sdisk, where, min(tocopy, sizeof(sdisk))); 468 if (error) 469 break; 470 where += tocopy; 471 *oldlenp += tocopy; 472 left -= tocopy; 473 } 474 simple_unlock(&disklist_slock); 475 return (error); 476 } 477 478 /* 479 * Create a device buffer queue. 480 */ 481 void 482 bufq_alloc(struct bufq_state *bufq, int flags) 483 { 484 void (*initfn)(struct bufq_state *); 485 486 bufq->bq_flags = flags; 487 488 switch (flags & BUFQ_SORT_MASK) { 489 case BUFQ_SORT_RAWBLOCK: 490 case BUFQ_SORT_CYLINDER: 491 break; 492 case 0: 493 if ((flags & BUFQ_METHOD_MASK) == BUFQ_FCFS) 494 break; 495 /* FALLTHROUGH */ 496 default: 497 panic("bufq_alloc: sort out of range"); 498 } 499 500 switch (flags & BUFQ_METHOD_MASK) { 501 case BUFQ_FCFS: 502 initfn = bufq_fcfs_init; 503 break; 504 case BUFQ_DISKSORT: 505 initfn = bufq_disksort_init; 506 break; 507 case BUFQ_READ_PRIO: 508 initfn = bufq_readprio_init; 509 break; 510 case BUFQ_PRIOCSCAN: 511 initfn = bufq_priocscan_init; 512 break; 513 default: 514 panic("bufq_alloc: method out of range"); 515 } 516 (*initfn)(bufq); 517 } 518 519 /* 520 * Destroy a device buffer queue. 521 */ 522 void 523 bufq_free(struct bufq_state *bufq) 524 { 525 526 KASSERT(bufq->bq_private != NULL); 527 KASSERT(BUFQ_PEEK(bufq) == NULL); 528 529 FREE(bufq->bq_private, M_DEVBUF); 530 bufq->bq_get = NULL; 531 bufq->bq_put = NULL; 532 } 533 534 /* 535 * Bounds checking against the media size, used for the raw partition. 536 * The sector size passed in should currently always be DEV_BSIZE, 537 * and the media size the size of the device in DEV_BSIZE sectors. 538 */ 539 int 540 bounds_check_with_mediasize(struct buf *bp, int secsize, u_int64_t mediasize) 541 { 542 int sz; 543 544 sz = howmany(bp->b_bcount, secsize); 545 546 if (bp->b_blkno + sz > mediasize) { 547 sz = mediasize - bp->b_blkno; 548 if (sz == 0) { 549 /* If exactly at end of disk, return EOF. */ 550 bp->b_resid = bp->b_bcount; 551 goto done; 552 } 553 if (sz < 0) { 554 /* If past end of disk, return EINVAL. */ 555 bp->b_error = EINVAL; 556 goto bad; 557 } 558 /* Otherwise, truncate request. */ 559 bp->b_bcount = sz << DEV_BSHIFT; 560 } 561 562 return 1; 563 564 bad: 565 bp->b_flags |= B_ERROR; 566 done: 567 return 0; 568 } 569