1.\" $NetBSD: disk.9,v 1.46 2017/07/03 21:28:48 wiz Exp $ 2.\" 3.\" Copyright (c) 1995, 1996 Jason R. Thorpe. 4.\" All rights reserved. 5.\" 6.\" Redistribution and use in source and binary forms, with or without 7.\" modification, are permitted provided that the following conditions 8.\" are met: 9.\" 1. Redistributions of source code must retain the above copyright 10.\" notice, this list of conditions and the following disclaimer. 11.\" 2. Redistributions in binary form must reproduce the above copyright 12.\" notice, this list of conditions and the following disclaimer in the 13.\" documentation and/or other materials provided with the distribution. 14.\" 3. All advertising materials mentioning features or use of this software 15.\" must display the following acknowledgement: 16.\" This product includes software developed for the NetBSD Project 17.\" by Jason R. Thorpe. 18.\" 4. The name of the author may not be used to endorse or promote products 19.\" derived from this software without specific prior written permission. 20.\" 21.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 22.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 23.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 25.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31.\" SUCH DAMAGE. 
32.\" 33.Dd March 5, 2017 34.Dt DISK 9 35.Os 36.Sh NAME 37.Nm disk , 38.Nm disk_init , 39.Nm disk_attach , 40.Nm disk_begindetach , 41.Nm disk_detach , 42.Nm disk_destroy , 43.Nm disk_wait , 44.Nm disk_busy , 45.Nm disk_unbusy , 46.Nm disk_isbusy , 47.Nm disk_find , 48.Nm disk_set_info 49.Nd generic disk framework 50.Sh SYNOPSIS 51.In sys/types.h 52.In sys/disklabel.h 53.In sys/disk.h 54.Ft void 55.Fn disk_init "struct disk *" "const char *name" "const struct dkdriver *driver" 56.Ft void 57.Fn disk_attach "struct disk *" 58.Ft int 59.Fn disk_begindetach "struct disk *" "int (*lastclose)(device_t)" "device_t self" "int flags" 60.Ft void 61.Fn disk_detach "struct disk *" 62.Ft void 63.Fn disk_destroy "struct disk *" 64.Ft void 65.Fn disk_wait "struct disk *" 66.Ft void 67.Fn disk_busy "struct disk *" 68.Ft void 69.Fn disk_unbusy "struct disk *" "long bcount" "int read" 70.Ft bool 71.Fn disk_isbusy "struct disk *" 72.Ft struct disk * 73.Fn disk_find "const char *" 74.Ft void 75.Fn disk_set_info "device_t" "struct disk *" "const char *type" 76.Sh DESCRIPTION 77The 78.Nx 79generic disk framework is designed to provide flexible, 80scalable, and consistent handling of disk state and metrics information. 81The fundamental component of this framework is the 82.Nm disk 83structure, which is defined as follows: 84.Bd -literal 85struct disk { 86 TAILQ_ENTRY(disk) dk_link; /* link in global disklist */ 87 const char *dk_name; /* disk name */ 88 prop_dictionary_t dk_info; /* reference to disk-info dictionary */ 89 int dk_bopenmask; /* block devices open */ 90 int dk_copenmask; /* character devices open */ 91 int dk_openmask; /* composite (bopen|copen) */ 92 int dk_state; /* label state ### */ 93 int dk_blkshift; /* shift to convert DEV_BSIZE to blks */ 94 int dk_byteshift; /* shift to convert bytes to blks */ 95 96 /* 97 * Metrics data; note that some metrics may have no meaning 98 * on certain types of disks. 
99 */ 100 struct io_stats *dk_stats; 101 102 const struct dkdriver *dk_driver; /* pointer to driver */ 103 104 /* 105 * Information required to be the parent of a disk wedge. 106 */ 107 kmutex_t dk_rawlock; /* lock on these fields */ 108 u_int dk_rawopens; /* # of opens of rawvp */ 109 struct vnode *dk_rawvp; /* vnode for the RAW_PART bdev */ 110 111 kmutex_t dk_openlock; /* lock on these and openmask */ 112 u_int dk_nwedges; /* # of configured wedges */ 113 /* all wedges on this disk */ 114 LIST_HEAD(, dkwedge_softc) dk_wedges; 115 116 /* 117 * Disk label information. Storage for the in-core disk label 118 * must be dynamically allocated, otherwise the size of this 119 * structure becomes machine-dependent. 120 */ 121 daddr_t dk_labelsector; /* sector containing label */ 122 struct disklabel *dk_label; /* label */ 123 struct cpu_disklabel *dk_cpulabel; 124}; 125.Ed 126.Pp 127The system maintains a global linked-list of all disks attached to the 128system. 129This list, called 130.Nm disklist , 131may grow or shrink over time as disks are dynamically added and removed 132from the system. 133Drivers which currently make use of the detachment 134capability of the framework are the 135.Nm ccd , 136.Nm dm , 137and 138.Nm vnd 139pseudo-device drivers. 140.Pp 141The following is a brief description of each function in the framework: 142.Bl -tag -width ".Fn disk_set_info" 143.It Fn disk_init 144Initialize the disk structure. 145.It Fn disk_attach 146Attach a disk; allocate storage for the disklabel, set the 147.Dq attached time 148timestamp, insert the disk into the disklist, and increment the 149system disk count. 150.It Fn disk_begindetach 151Check whether the disk is open, and if not, return 0. 152If the disk is open, and 153.Dv DETACH_FORCE 154is not set in 155.Fa flags , 156return 157.Dv EBUSY . 158Otherwise, call the provided 159.Fa lastclose 160routine 161.Po 162if not 163.Dv NULL 164.Pc 165and return its exit code. 
166.It Fn disk_detach 167Detach a disk; free storage for the disklabel, remove the disk 168from the disklist, and decrement the system disk count. 169If the count drops below zero, panic. 170.It Fn disk_destroy 171Release resources used by the disk structure when it is no longer 172required. 173.It Fn disk_wait 174Disk timings are measured by counting the number of queued 175requests (wait counter) and requests issued to the hardware (busy counter) 176and keeping a timestamp of when the counters change. 177The time interval between 178two changes of a counter is accumulated into a total and also multiplied 179by the counter value and then accumulated into a sum. 180Both values can be 181used to determine how much time is spent in the driver queue or in-flight 182to the hardware as well as the average number of requests in either state. 183.Fn disk_wait 184increments the disk's wait counter and handles the accumulation. 185.It Fn disk_busy 186Decrements the disk's wait counter and increments the disk's 187.Dq busy counter , 188and handles either accumulation. 189If the wait counter is still zero, it 190is assumed that the driver hasn't been updated to call 191.Fn disk_wait , 192in which case only the values from the busy counter are available. 193.It Fn disk_unbusy 194Decrements the disk's busy counter and handles the accumulation. 195The third argument 196.Ar read 197specifies the direction of I/O; 198if non-zero it means reading from the disk, 199otherwise it means writing to the disk. 200.It Fn disk_isbusy 201Returns 202.Ar true 203if the disk is marked as busy and false if it is not. 204.It Fn disk_find 205Return a pointer to the disk structure corresponding to the name provided, 206or 207.Dv NULL 208if the disk does not exist. 209.It Fn disk_set_info 210Set up the disk-info dictionary and other dependent values of the disk structure; 211the driver must have initialized the dk_geom member of 212.Fa struct disk 213with suitable values. 
214If 215.Fa type 216is not 217.Dv NULL , 218it will be added to the dictionary. 219.El 220.Pp 221The functions typically called by device drivers are 222.Fn disk_init , 223.Fn disk_attach , 224.Fn disk_begindetach , 225.Fn disk_detach , 226.Fn disk_destroy , 227.Fn disk_wait , 228.Fn disk_busy , 229.Fn disk_unbusy , 230and 231.Fn disk_set_info . 232The function 233.Fn disk_find 234is provided as a utility function. 235.Sh DISK IOCTLS 236The following ioctls should be implemented by disk drivers: 237.Bl -tag -width "xxxxxx" 238.It Dv DIOCGDINFO "struct disklabel" 239Get disklabel. 240.It Dv DIOCSDINFO "struct disklabel" 241Set in-memory disklabel. 242.It Dv DIOCWDINFO "struct disklabel" 243Set in-memory disklabel and write on-disk disklabel. 244.It Dv DIOCGPART "struct partinfo" 245Get partition information. 246This is used internally. 247.It Dv DIOCRFORMAT "struct format_op" 248Read format. 249.It Dv DIOCWFORMAT "struct format_op" 250Write format. 251.It Dv DIOCSSTEP "int" 252Set step rate. 253.It Dv DIOCSRETRIES "int" 254Set number of retries. 255.It Dv DIOCKLABEL "int" 256Specify whether to keep or drop the in-memory disklabel 257when the device is closed. 258.It Dv DIOCWLABEL "int" 259Enable or disable writing to the part of the disk that contains the label. 260.It Dv DIOCSBAD "struct dkbad" 261Set kernel dkbad. 262.It Dv DIOCEJECT "int" 263Eject removable disk. 264.It Dv DIOCLOCK "int" 265Lock or unlock disk pack. 266For devices with removable media, locking is intended to prevent 267the operator from removing the media. 268.It Dv DIOCGDEFLABEL "struct disklabel" 269Get default label. 270.It Dv DIOCCLRLABEL 271Clear disk label. 272.It Dv DIOCGCACHE "int" 273Get status of disk read and write caches. 274The result is a bitmask containing the following values: 275.Bl -tag -width DKCACHE_RCHANGE 276.It Dv DKCACHE_READ 277Read cache enabled. 278.It Dv DKCACHE_WRITE 279Write(back) cache enabled. 280.It Dv DKCACHE_RCHANGE 281Read cache enable is changeable. 
282.It Dv DKCACHE_WCHANGE 283Write cache enable is changeable. 284.It Dv DKCACHE_SAVE 285Cache parameters may be saved, so that they persist across reboots 286or device detach/attach cycles. 287.El 288.It Dv DIOCSCACHE "int" 289Set status of disk read and write caches. 290The input is a bitmask in the same format as used for 291.Dv DIOCGCACHE . 292.It Dv DIOCCACHESYNC "int" 293Synchronise the disk cache. 294This causes information in the disk's write cache (if any) 295to be flushed to stable storage. 296The argument specifies whether or not to force a flush even if 297the kernel believes that there is no outstanding data. 298.It Dv DIOCBSLIST "struct disk_badsecinfo" 299Get bad sector list. 300.It Dv DIOCBSFLUSH 301Flush bad sector list. 302.It Dv DIOCAWEDGE "struct dkwedge_info" 303Add wedge. 304.It Dv DIOCGWEDGEINFO "struct dkwedge_info" 305Get wedge information. 306.It Dv DIOCDWEDGE "struct dkwedge_info" 307Delete wedge. 308.It Dv DIOCLWEDGES "struct dkwedge_list" 309List wedges. 310.It Dv DIOCGSTRATEGY "struct disk_strategy" 311Get disk buffer queue strategy. 312.It Dv DIOCSSTRATEGY "struct disk_strategy" 313Set disk buffer queue strategy. 314.It Dv DIOCGDISKINFO "struct plistref" 315Get disk-info dictionary. 316.It Dv DIOCGMEDIASIZE "off_t" 317Get disk size in bytes. 318.It Dv DIOCGSECTORSIZE "u_int" 319Get sector size in bytes. 320.El 321.Sh USING THE FRAMEWORK 322This section includes a description on basic use of the framework 323and example usage of its functions. 324Actual implementation of a device driver which uses the framework 325may vary. 326.Pp 327Each device in the system uses a 328.Dq softc 329structure which contains autoconfiguration and state information for that 330device. 331In the case of disks, the softc should also contain one instance 332of the disk structure, e.g.: 333.Bd -literal 334struct foo_softc { 335 device_t sc_dev; /* generic device information */ 336 struct disk sc_dk; /* generic disk information */ 337 [ . . . more . . . 
] 338}; 339.Ed 340.Pp 341In order for the system to gather metrics data about a disk, the disk must 342be registered with the system. 343The 344.Fn disk_attach 345routine performs all of the functions currently required to register a disk 346with the system including allocation of disklabel storage space, 347recording of the time since boot that the disk was attached, and insertion 348into the disklist. 349Note that since this function allocates storage space for the disklabel, 350it must be called before the disklabel is read from the media or used in 351any other way. 352Before 353.Fn disk_attach 354is called, portions of the disk structure must be initialized with 355data specific to that disk. 356For example, in the 357.Dq foo 358disk driver, the following would be performed in the autoconfiguration 359.Dq attach 360routine: 361.Bd -literal 362void 363fooattach(device_t parent, device_t self, void *aux) 364{ 365 struct foo_softc *sc = device_private(self); 366 [ . . . ] 367 368 /* Initialize and attach the disk structure. */ 369 disk_init(&sc->sc_dk, device_xname(self), &foodkdriver); 370 disk_attach(&sc->sc_dk); 371 372 /* Read geometry and fill in pertinent parts of disklabel. */ 373 /* Initialize geometry values of the disk structure */ 374 [ . . . ] 375 disk_set_info(self, &sc->sc_dk, type); 376} 377.Ed 378.Pp 379The 380.Nm foodkdriver 381above is the disk's 382.Dq driver 383switch. 384This switch currently includes pointers to several driver entry points, 385where only the 386.Nm d_strategy 387entry point is used by the disk framework. 
388This switch needs to have global scope and should be initialized as follows: 389.Bd -literal 390void (foostrategy)(struct buf *); 391void (foominphys)(struct buf *); 392int (fooopen)(dev_t, int, int, struct lwp *); 393int (fooclose)(dev_t, int, int, struct lwp *); 394int (foo_discard)(device_t, off_t, off_t); 395int (foo_diskstart)(device_t, struct buf *); 396void (foo_iosize)(device_t, int *); 397int (foo_dumpblocks)(device_t, void *, daddr_t, int); 398int (foo_lastclose)(device_t); 399int (foo_firstopen)(device_t, dev_t, int, int); 400int (foo_label)(device_t, struct disklabel *); 401 402const struct dkdriver foodkdriver = { 403 .d_open = fooopen, 404 .d_close = fooclose, 405 .d_strategy = foostrategy, 406 .d_minphys = foominphys, 407 .d_discard = foo_discard, 408 .d_diskstart = foo_diskstart, /* optional */ 409 .d_dumpblocks = foo_dumpblocks, /* optional */ 410 .d_iosize = foo_iosize, /* optional */ 411 .d_firstopen = foo_firstopen, /* optional */ 412 .d_lastclose = foo_lastclose, /* optional */ 413 .d_label = foo_label, /* optional */ 414}; 415.Ed 416.Pp 417Once the disk is attached, metrics may be gathered on that disk. 418In order to gather metrics data, the driver must tell the framework when 419the disk queues, starts and stops operations. 420This functionality is provided by the 421.Fn disk_wait , 422.Fn disk_busy 423and 424.Fn disk_unbusy 425routines. 426Because 427.Nm struct disk 428is part of the device driver's private data, it needs to be guarded. 429Mutual exclusion must be provided by the driver, because 430.Fn disk_wait , 431.Fn disk_busy 432and 433.Fn disk_unbusy 434are not thread safe. 435The 436.Fn disk_busy 437routine should be called immediately before a command to the disk is 438sent, e.g.: 439.Bd -literal 440void 441foostrategy(struct buf *bp) 442{ 443 [ . . . 
] 444 445 mutex_enter(&sc->sc_dk_mtx); 446 disk_wait(&sc->sc_dk); 447 448 /* Put buffer onto drive's transfer queue */ 449 450 mutex_exit(&sc->sc_dk_mtx); 451 452 foostart(sc); 453} 454 455void 456foostart(struct foo_softc *sc) 457{ 458 [ . . . ] 459 460 /* Get buffer from drive's transfer queue. */ 461 [ . . . ] 462 463 /* Build command to send to drive. */ 464 [ . . . ] 465 466 /* Tell the disk framework we're going busy. */ 467 mutex_enter(&sc->sc_dk_mtx); 468 disk_busy(&sc->sc_dk); 469 mutex_exit(&sc->sc_dk_mtx); 470 471 /* Send command to the drive. */ 472 [ . . . ] 473} 474.Ed 475.Pp 476The routine 477.Fn disk_unbusy 478performs some consistency checks, such as ensuring that the calls to 479.Fn disk_busy 480and 481.Fn disk_unbusy 482are balanced. 483It also performs the final steps of the metrics calculation. 484A byte count is added to the disk's running total, and if greater than 485zero, the number of transfers the disk has performed is incremented. 486The third argument 487.Ar read 488specifies the direction of I/O; 489if non-zero it means reading from the disk, 490otherwise it means writing to the disk. 491.Bd -literal 492void 493foodone(xfer) 494 struct foo_xfer *xfer; 495{ 496 struct foo_softc *sc = (struct foo_softc *)xfer->xf_softc; 497 struct buf *bp = xfer->xf_buf; 498 long nbytes; 499 [ . . . ] 500 501 /* 502 * Get number of bytes transferred. If there is no buf 503 * associated with the xfer, we are being called at the 504 * end of a non-I/O command. 505 */ 506 if (bp == NULL) 507 nbytes = 0; 508 else 509 nbytes = bp->b_bcount - bp->b_resid; 510 511 [ . . . ] 512 513 mutex_enter(&sc->sc_dk_mtx); 514 /* Notify the disk framework that we've completed the transfer. */ 515 disk_unbusy(&sc->sc_dk, nbytes, 516 bp != NULL ? bp->b_flags & B_READ : 0); 517 mutex_exit(&sc->sc_dk_mtx); 518 519 [ . . . ] 520} 521.Ed 522.Pp 523.Fn disk_isbusy 524is used to get the status of a disk device; it returns true if the device is 525currently busy and false if it is not. 
526Like 527.Fn disk_wait , 528.Fn disk_busy 529and 530.Fn disk_unbusy 531it requires explicit locking by the caller. 532.Sh CODE REFERENCES 533The disk framework itself is implemented within the file 534.Pa sys/kern/subr_disk.c . 535Data structures and function prototypes for the framework are located in 536.Pa sys/sys/disk.h . 537.Pp 538The 539.Nx 540machine-independent SCSI disk and CD-ROM drivers use the 541disk framework. 542They are located in 543.Pa sys/scsi/sd.c 544and 545.Pa sys/scsi/cd.c . 546.Pp 547The 548.Nx 549.Nm ccd , 550.Nm dm , 551and 552.Nm vnd 553drivers use the detachment capability of the framework. 554They are located in 555.Pa sys/dev/ccd.c , 556.Pa sys/dev/vnd.c , 557and 558.Pa sys/dev/dm/device-mapper.c . 559.Sh SEE ALSO 560.Xr ccd 4 , 561.Xr dm 4 , 562.Xr vnd 4 , 563.Xr dksubr 9 564.Sh HISTORY 565The 566.Nx 567generic disk framework appeared in 568.Nx 1.2 . 569.Sh AUTHORS 570The 571.Nx 572generic disk framework was architected and implemented by 573.An Jason R. Thorpe 574.Aq Mt thorpej@NetBSD.org . 575