1.\" $NetBSD: disk.9,v 1.43 2017/01/23 11:42:03 abhinav Exp $ 2.\" 3.\" Copyright (c) 1995, 1996 Jason R. Thorpe. 4.\" All rights reserved. 5.\" 6.\" Redistribution and use in source and binary forms, with or without 7.\" modification, are permitted provided that the following conditions 8.\" are met: 9.\" 1. Redistributions of source code must retain the above copyright 10.\" notice, this list of conditions and the following disclaimer. 11.\" 2. Redistributions in binary form must reproduce the above copyright 12.\" notice, this list of conditions and the following disclaimer in the 13.\" documentation and/or other materials provided with the distribution. 14.\" 3. All advertising materials mentioning features or use of this software 15.\" must display the following acknowledgement: 16.\" This product includes software developed for the NetBSD Project 17.\" by Jason R. Thorpe. 18.\" 4. The name of the author may not be used to endorse or promote products 19.\" derived from this software without specific prior written permission. 20.\" 21.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 22.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 23.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 25.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31.\" SUCH DAMAGE. 
32.\" 33.Dd December 29, 2014 34.Dt DISK 9 35.Os 36.Sh NAME 37.Nm disk , 38.Nm disk_init , 39.Nm disk_attach , 40.Nm disk_begindetach , 41.Nm disk_detach , 42.Nm disk_destroy , 43.Nm disk_busy , 44.Nm disk_unbusy , 45.Nm disk_isbusy , 46.Nm disk_find , 47.Nm disk_set_info 48.Nd generic disk framework 49.Sh SYNOPSIS 50.In sys/types.h 51.In sys/disklabel.h 52.In sys/disk.h 53.Ft void 54.Fn disk_init "struct disk *" "const char *name" "const struct dkdriver *driver" 55.Ft void 56.Fn disk_attach "struct disk *" 57.Ft int 58.Fn disk_begindetach "struct disk *" "int (*lastclose)(device_t)" "device_t self" "int flags" 59.Ft void 60.Fn disk_detach "struct disk *" 61.Ft void 62.Fn disk_destroy "struct disk *" 63.Ft void 64.Fn disk_busy "struct disk *" 65.Ft void 66.Fn disk_unbusy "struct disk *" "long bcount" "int read" 67.Ft bool 68.Fn disk_isbusy "struct disk *" 69.Ft struct disk * 70.Fn disk_find "const char *" 71.Ft void 72.Fn disk_set_info "device_t" "struct disk *" "const char *type" 73.Sh DESCRIPTION 74The 75.Nx 76generic disk framework is designed to provide flexible, 77scalable, and consistent handling of disk state and metrics information. 78The fundamental component of this framework is the 79.Nm disk 80structure, which is defined as follows: 81.Bd -literal 82struct disk { 83 TAILQ_ENTRY(disk) dk_link; /* link in global disklist */ 84 const char *dk_name; /* disk name */ 85 prop_dictionary_t dk_info; /* reference to disk-info dictionary */ 86 int dk_bopenmask; /* block devices open */ 87 int dk_copenmask; /* character devices open */ 88 int dk_openmask; /* composite (bopen|copen) */ 89 int dk_state; /* label state ### */ 90 int dk_blkshift; /* shift to convert DEV_BSIZE to blks */ 91 int dk_byteshift; /* shift to convert bytes to blks */ 92 93 /* 94 * Metrics data; note that some metrics may have no meaning 95 * on certain types of disks. 
96 */ 97 struct io_stats *dk_stats; 98 99 const struct dkdriver *dk_driver; /* pointer to driver */ 100 101 /* 102 * Information required to be the parent of a disk wedge. 103 */ 104 kmutex_t dk_rawlock; /* lock on these fields */ 105 u_int dk_rawopens; /* # of opens of rawvp */ 106 struct vnode *dk_rawvp; /* vnode for the RAW_PART bdev */ 107 108 kmutex_t dk_openlock; /* lock on these and openmask */ 109 u_int dk_nwedges; /* # of configured wedges */ 110 /* all wedges on this disk */ 111 LIST_HEAD(, dkwedge_softc) dk_wedges; 112 113 /* 114 * Disk label information. Storage for the in-core disk label 115 * must be dynamically allocated, otherwise the size of this 116 * structure becomes machine-dependent. 117 */ 118 daddr_t dk_labelsector; /* sector containing label */ 119 struct disklabel *dk_label; /* label */ 120 struct cpu_disklabel *dk_cpulabel; 121}; 122.Ed 123.Pp 124The system maintains a global linked-list of all disks attached to the 125system. 126This list, called 127.Nm disklist , 128may grow or shrink over time as disks are dynamically added and removed 129from the system. 130Drivers which currently make use of the detachment 131capability of the framework are the 132.Nm ccd , 133.Nm dm , 134and 135.Nm vnd 136pseudo-device drivers. 137.Pp 138The following is a brief description of each function in the framework: 139.Bl -tag -width ".Fn disk_set_info" 140.It Fn disk_init 141Initialize the disk structure. 142.It Fn disk_attach 143Attach a disk; allocate storage for the disklabel, set the 144.Dq attached time 145timestamp, insert the disk into the disklist, and increment the 146system disk count. 147.It Fn disk_begindetach 148Check whether the disk is open, and if not, return 0. 149If the disk is open, and 150.Dv DETACH_FORCE 151is not set in 152.Fa flags , 153return 154.Dv EBUSY . 155Otherwise, call the provided 156.Fa lastclose 157routine 158.Po 159if not 160.Dv NULL 161.Pc 162and return its exit code. 
163.It Fn disk_detach 164Detach a disk; free storage for the disklabel, remove the disk 165from the disklist, and decrement the system disk count. 166If the count drops below zero, panic. 167.It Fn disk_destroy 168Release resources used by the disk structure when it is no longer 169required. 170.It Fn disk_busy 171Increment the disk's 172.Dq busy counter . 173If this counter goes from 0 to 1, set the timestamp corresponding to 174this transfer. 175.It Fn disk_unbusy 176Decrement a disk's busy counter. 177If the count drops below zero, panic. 178Get the current time, subtract the disk's timestamp from it, and add 179the difference to the disk's running total. 180Set the disk's timestamp to the current time. 181If the provided byte count is greater than 0, add it to the disk's 182running total and increment the number of transfers performed by the disk. 183The third argument 184.Ar read 185specifies the direction of I/O; 186if non-zero it means reading from the disk, 187otherwise it means writing to the disk. 188.It Fn disk_isbusy 189Returns 190.Ar true 191if the disk is marked as busy and false if it is not. 192.It Fn disk_find 193Return a pointer to the disk structure corresponding to the name provided, 194or 195.Dv NULL 196if the disk does not exist. 197.It Fn disk_set_info 198Set up the disk-info dictionary and other dependent values of the disk structure. 199The driver must have initialized the dk_geom member of 200.Fa struct disk 201with suitable values. 202If 203.Fa type 204is not 205.Dv NULL , 206it will be added to the dictionary. 207.El 208.Pp 209The functions typically called by device drivers are 210.Fn disk_init , 211.Fn disk_attach , 212.Fn disk_begindetach , 213.Fn disk_detach , 214.Fn disk_destroy , 215.Fn disk_busy , 216.Fn disk_unbusy , 217and 218.Fn disk_set_info . 219The function 220.Fn disk_find 221is provided as a utility function. 
222.Sh DISK IOCTLS 223The following ioctls should be implemented by disk drivers: 224.Bl -tag -width "xxxxxx" 225.It Dv DIOCGDINFO "struct disklabel" 226Get disklabel. 227.It Dv DIOCSDINFO "struct disklabel" 228Set in-memory disklabel. 229.It Dv DIOCWDINFO "struct disklabel" 230Set in-memory disklabel and write on-disk disklabel. 231.It Dv DIOCGPART "struct partinfo" 232Get partition information. 233This is used internally. 234.It Dv DIOCRFORMAT "struct format_op" 235Read format. 236.It Dv DIOCWFORMAT "struct format_op" 237Write format. 238.It Dv DIOCSSTEP "int" 239Set step rate. 240.It Dv DIOCSRETRIES "int" 241Set number of retries. 242.It Dv DIOCKLABEL "int" 243Specify whether to keep or drop the in-memory disklabel 244when the device is closed. 245.It Dv DIOCWLABEL "int" 246Enable or disable writing to the part of the disk that contains the label. 247.It Dv DIOCSBAD "struct dkbad" 248Set kernel dkbad. 249.It Dv DIOCEJECT "int" 250Eject removable disk. 251.It Dv DIOCLOCK "int" 252Lock or unlock disk pack. 253For devices with removable media, locking is intended to prevent 254the operator from removing the media. 255.It Dv DIOCGDEFLABEL "struct disklabel" 256Get default label. 257.It Dv DIOCCLRLABEL 258Clear disk label. 259.It Dv DIOCGCACHE "int" 260Get status of disk read and write caches. 261The result is a bitmask containing the following values: 262.Bl -tag -width DKCACHE_RCHANGE 263.It Dv DKCACHE_READ 264Read cache enabled. 265.It Dv DKCACHE_WRITE 266Write(back) cache enabled. 267.It Dv DKCACHE_RCHANGE 268Read cache enable is changeable. 269.It Dv DKCACHE_WCHANGE 270Write cache enable is changeable. 271.It Dv DKCACHE_SAVE 272Cache parameters may be saved, so that they persist across reboots 273or device detach/attach cycles. 274.El 275.It Dv DIOCSCACHE "int" 276Set status of disk read and write caches. 277The input is a bitmask in the same format as used for 278.Dv DIOCGCACHE . 279.It Dv DIOCCACHESYNC "int" 280Synchronise the disk cache. 
281This causes information in the disk's write cache (if any) 282to be flushed to stable storage. 283The argument specifies whether or not to force a flush even if 284the kernel believes that there is no outstanding data. 285.It Dv DIOCBSLIST "struct disk_badsecinfo" 286Get bad sector list. 287.It Dv DIOCBSFLUSH 288Flush bad sector list. 289.It Dv DIOCAWEDGE "struct dkwedge_info" 290Add wedge. 291.It Dv DIOCGWEDGEINFO "struct dkwedge_info" 292Get wedge information. 293.It Dv DIOCDWEDGE "struct dkwedge_info" 294Delete wedge. 295.It Dv DIOCLWEDGES "struct dkwedge_list" 296List wedges. 297.It Dv DIOCGSTRATEGY "struct disk_strategy" 298Get disk buffer queue strategy. 299.It Dv DIOCSSTRATEGY "struct disk_strategy" 300Set disk buffer queue strategy. 301.It Dv DIOCGDISKINFO "struct plistref" 302Get disk-info dictionary. 303.It Dv DIOCGMEDIASIZE "off_t" 304Get disk size in bytes. 305.It Dv DIOCGSECTORSIZE "u_int" 306Get sector size in bytes. 307.El 308.Sh USING THE FRAMEWORK 309This section includes a description on basic use of the framework 310and example usage of its functions. 311Actual implementation of a device driver which uses the framework 312may vary. 313.Pp 314Each device in the system uses a 315.Dq softc 316structure which contains autoconfiguration and state information for that 317device. 318In the case of disks, the softc should also contain one instance 319of the disk structure, e.g.: 320.Bd -literal 321struct foo_softc { 322 device_t sc_dev; /* generic device information */ 323 struct disk sc_dk; /* generic disk information */ 324 [ . . . more . . . ] 325}; 326.Ed 327.Pp 328In order for the system to gather metrics data about a disk, the disk must 329be registered with the system. 330The 331.Fn disk_attach 332routine performs all of the functions currently required to register a disk 333with the system including allocation of disklabel storage space, 334recording of the time since boot that the disk was attached, and insertion 335into the disklist. 
336Note that since this function allocates storage space for the disklabel, 337it must be called before the disklabel is read from the media or used in 338any other way. 339Before 340.Fn disk_attach 341is called, portions of the disk structure must be initialized with 342data specific to that disk. 343For example, in the 344.Dq foo 345disk driver, the following would be performed in the autoconfiguration 346.Dq attach 347routine: 348.Bd -literal 349void 350fooattach(device_t parent, device_t self, void *aux) 351{ 352 struct foo_softc *sc = device_private(self); 353 [ . . . ] 354 355 /* Initialize and attach the disk structure. */ 356 disk_init(\*[Am]sc-\*[Gt]sc_dk, device_xname(self), \*[Am]foodkdriver); 357 disk_attach(\*[Am]sc-\*[Gt]sc_dk); 358 359 /* Read geometry and fill in pertinent parts of disklabel. */ 360 /* Initialize geometry values of the disk structure */ 361 [ . . . ] 362 disk_set_info(self, \*[Am]sc-\*[Gt]sc_dk, type); 363} 364.Ed 365.Pp 366The 367.Nm foodkdriver 368above is the disk's 369.Dq driver 370switch. 371This switch currently includes pointers to several driver entry points, 372where only the 373.Nm d_strategy 374entry point is used by the disk framework. 
375This switch needs to have global scope and should be initialized as follows: 376.Bd -literal 377void (foostrategy)(struct buf *); 378void (foominphys)(struct buf *); 379int (fooopen)(dev_t, int, int, struct lwp *); 380int (fooclose)(dev_t, int, int, struct lwp *); 381int (foo_discard)(device_t, off_t, off_t); 382int (foo_diskstart)(device_t, struct buf *); 383void (foo_iosize)(device_t, int *); 384int (foo_dumpblocks)(device_t, void *, daddr_t, int); 385int (foo_lastclose)(device_t); 386int (foo_firstopen)(device_t, dev_t, int, int); 387int (foo_label)(device_t, struct disklabel *); 388 389const struct dkdriver foodkdriver = { 390 .d_open = fooopen, 391 .d_close = fooclose, 392 .d_strategy = foostrategy, 393 .d_minphys = foominphys, 394 .d_discard = foo_discard, 395 .d_diskstart = foo_diskstart, /* optional */ 396 .d_dumpblocks = foo_dumpblocks, /* optional */ 397 .d_iosize = foo_iosize, /* optional */ 398 .d_firstopen = foo_firstopen, /* optional */ 399 .d_lastclose = foo_lastclose, /* optional */ 400 .d_label = foo_label, /* optional */ 401}; 402.Ed 403.Pp 404Once the disk is attached, metrics may be gathered on that disk. 405In order to gather metrics data, the driver must tell the framework when 406the disk starts and stops operations. 407This functionality is provided by the 408.Fn disk_busy 409and 410.Fn disk_unbusy 411routines. 412Because 413.Nm struct disk 414is part of the device driver's private data, it needs to be guarded. 415Mutual exclusion must be provided by the driver, because 416.Fn disk_busy 417and 418.Fn disk_unbusy 419are not thread safe. 420The 421.Fn disk_busy 422routine should be called immediately before a command to the disk is 423sent, e.g.: 424.Bd -literal 425void 426foostart(sc) 427 struct foo_softc *sc; 428{ 429 [ . . . ] 430 431 /* Get buffer from drive's transfer queue. */ 432 [ . . . ] 433 434 /* Build command to send to drive. */ 435 [ . . . ] 436 437 /* Tell the disk framework we're going busy. 
*/ 438 mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx); 439 disk_busy(\*[Am]sc-\*[Gt]sc_dk); 440 mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx); 441 442 /* Send command to the drive. */ 443 [ . . . ] 444} 445.Ed 446.Pp 447When 448.Fn disk_busy 449is called, a timestamp is taken if the disk's busy counter moves from 4500 to 1, indicating the disk has gone from an idle to non-idle state. 451At the end of a transaction, the 452.Fn disk_unbusy 453routine should be called. 454This routine performs some consistency checks, 455such as ensuring that the calls to 456.Fn disk_busy 457and 458.Fn disk_unbusy 459are balanced. 460This routine also performs the actual metrics calculation. 461A timestamp is taken and the difference from the timestamp taken in 462.Fn disk_busy 463is added to the disk's total running time. 464The disk's timestamp is then updated in case there is more than one 465pending transfer on the disk. 466A byte count is also added to the disk's running total, and if greater than 467zero, the number of transfers the disk has performed is incremented. 468The third argument 469.Ar read 470specifies the direction of I/O; 471if non-zero it means reading from the disk, 472otherwise it means writing to the disk. 473.Bd -literal 474void 475foodone(xfer) 476 struct foo_xfer *xfer; 477{ 478 struct foo_softc *sc = (struct foo_softc *)xfer-\*[Gt]xf_softc; 479 struct buf *bp = xfer-\*[Gt]xf_buf; 480 long nbytes; 481 [ . . . ] 482 483 /* 484 * Get number of bytes transferred. If there is no buf 485 * associated with the xfer, we are being called at the 486 * end of a non-I/O command. 487 */ 488 if (bp == NULL) 489 nbytes = 0; 490 else 491 nbytes = bp-\*[Gt]b_bcount - bp-\*[Gt]b_resid; 492 493 [ . . . ] 494 495 mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx); 496 /* Notify the disk framework that we've completed the transfer. */ 497 disk_unbusy(\*[Am]sc-\*[Gt]sc_dk, nbytes, 498 bp != NULL ? bp-\*[Gt]b_flags \*[Am] B_READ : 0); 499 mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx); 500 501 [ . . . 
] 502} 503.Ed 504.Pp 505.Fn disk_isbusy 506is used to get the status of a disk device; it returns true if the device is 507currently busy and false if it is not. 508Like 509.Fn disk_busy 510and 511.Fn disk_unbusy 512it requires explicit locking by the caller. 513.Sh CODE REFERENCES 514The disk framework itself is implemented within the file 515.Pa sys/kern/subr_disk.c . 516Data structures and function prototypes for the framework are located in 517.Pa sys/sys/disk.h . 518.Pp 519The 520.Nx 521machine-independent SCSI disk and CD-ROM drivers use the 522disk framework. 523They are located in 524.Pa sys/scsi/sd.c 525and 526.Pa sys/scsi/cd.c . 527.Pp 528The 529.Nx 530.Nm ccd , 531.Nm dm , 532and 533.Nm vnd 534drivers use the detachment capability of the framework. 535They are located in 536.Pa sys/dev/ccd.c , 537.Pa sys/dev/vnd.c , 538and 539.Pa sys/dev/dm/device-mapper.c . 540.Sh SEE ALSO 541.Xr ccd 4 , 542.Xr dm 4 , 543.Xr vnd 4 , 544.Xr dksubr 9 545.Sh HISTORY 546The 547.Nx 548generic disk framework appeared in 549.Nx 1.2 . 550.Sh AUTHORS 551The 552.Nx 553generic disk framework was architected and implemented by 554.An Jason R. Thorpe 555.Aq thorpej@NetBSD.org . 556