xref: /netbsd-src/share/man/man9/disk.9 (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1.\"	$NetBSD: disk.9,v 1.35 2009/12/30 14:53:02 wiz Exp $
2.\"
3.\" Copyright (c) 1995, 1996 Jason R. Thorpe.
4.\" All rights reserved.
5.\"
6.\" Redistribution and use in source and binary forms, with or without
7.\" modification, are permitted provided that the following conditions
8.\" are met:
9.\" 1. Redistributions of source code must retain the above copyright
10.\"    notice, this list of conditions and the following disclaimer.
11.\" 2. Redistributions in binary form must reproduce the above copyright
12.\"    notice, this list of conditions and the following disclaimer in the
13.\"    documentation and/or other materials provided with the distribution.
14.\" 3. All advertising materials mentioning features or use of this software
15.\"    must display the following acknowledgement:
16.\"	This product includes software developed for the NetBSD Project
17.\"	by Jason R. Thorpe.
18.\" 4. The name of the author may not be used to endorse or promote products
19.\"    derived from this software without specific prior written permission.
20.\"
21.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.Dd December 30, 2009
34.Dt DISK 9
35.Os
36.Sh NAME
37.Nm disk ,
38.Nm disk_init ,
39.Nm disk_attach ,
40.Nm disk_begindetach ,
41.Nm disk_detach ,
42.Nm disk_destroy ,
43.Nm disk_busy ,
44.Nm disk_unbusy ,
45.Nm disk_isbusy ,
46.Nm disk_find ,
47.Nm disk_blocksize
48.Nd generic disk framework
49.Sh SYNOPSIS
50.In sys/types.h
51.In sys/disklabel.h
52.In sys/disk.h
53.Ft void
54.Fn disk_init "struct disk *" "const char *name" "const struct dkdriver *driver"
55.Ft void
56.Fn disk_attach "struct disk *"
57.Ft int
58.Fn disk_begindetach "struct disk *" "int (*lastclose)(device_t)" "device_t self" "int flags"
59.Ft void
60.Fn disk_detach "struct disk *"
61.Ft void
62.Fn disk_destroy "struct disk *"
63.Ft void
64.Fn disk_busy "struct disk *"
65.Ft void
66.Fn disk_unbusy "struct disk *" "long bcount" "int read"
67.Ft bool
68.Fn disk_isbusy "struct disk *"
69.Ft struct disk *
70.Fn disk_find "const char *"
71.Ft void
72.Fn disk_blocksize "struct disk *" "int blocksize"
73.Sh DESCRIPTION
74The
75.Nx
76generic disk framework is designed to provide flexible,
77scalable, and consistent handling of disk state and metrics information.
78The fundamental component of this framework is the
79.Nm disk
80structure, which is defined as follows:
81.Bd -literal
82struct disk {
83	TAILQ_ENTRY(disk) dk_link;	/* link in global disklist */
84	const char	*dk_name;	/* disk name */
85	prop_dictionary_t dk_info;	/* reference to disk-info dictionary */
86	int		dk_bopenmask;	/* block devices open */
87	int		dk_copenmask;	/* character devices open */
88	int		dk_openmask;	/* composite (bopen|copen) */
89	int		dk_state;	/* label state   ### */
90	int		dk_blkshift;	/* shift to convert DEV_BSIZE to blks */
91	int		dk_byteshift;	/* shift to convert bytes to blks */
92
93	/*
94	 * Metrics data; note that some metrics may have no meaning
95	 * on certain types of disks.
96	 */
97	struct io_stats	*dk_stats;
98
99	const struct dkdriver *dk_driver;	/* pointer to driver */
100
101	/*
102	 * Information required to be the parent of a disk wedge.
103	 */
104	kmutex_t	dk_rawlock;	/* lock on these fields */
105	u_int		dk_rawopens;	/* # of opens of rawvp */
106	struct vnode	*dk_rawvp;	/* vnode for the RAW_PART bdev */
107
108	kmutex_t	dk_openlock;	/* lock on these and openmask */
109	u_int		dk_nwedges;	/* # of configured wedges */
110					/* all wedges on this disk */
111	LIST_HEAD(, dkwedge_softc) dk_wedges;
112
113	/*
114	 * Disk label information.  Storage for the in-core disk label
115	 * must be dynamically allocated, otherwise the size of this
116	 * structure becomes machine-dependent.
117	 */
118	daddr_t		dk_labelsector;		/* sector containing label */
119	struct disklabel *dk_label;	/* label */
120	struct cpu_disklabel *dk_cpulabel;
121};
122.Ed
123.Pp
124The system maintains a global linked-list of all disks attached to the
125system.
126This list, called
127.Nm disklist ,
128may grow or shrink over time as disks are dynamically added and removed
129from the system.
130Drivers which currently make use of the detachment
131capability of the framework are the
132.Nm ccd ,
133.Nm dm ,
134and
135.Nm vnd
136pseudo-device drivers.
137.Pp
138The following is a brief description of each function in the framework:
139.Bl -tag -width ".Fn disk_blocksize"
140.It Fn disk_init
141Initialize the disk structure.
142.It Fn disk_attach
143Attach a disk; allocate storage for the disklabel, set the
144.Dq attached time
145timestamp, insert the disk into the disklist, and increment the
146system disk count.
147.It Fn disk_begindetach
148Check whether the disk is open, and if not, return 0.
149If the disk is open, and
150.Dv DETACH_FORCE
151is not set in
152.Fa flags ,
153return
154.Dv EBUSY .
155Otherwise, call the provided
156.Fa lastclose
157routine
158.Po
159if not
160.Dv NULL
161.Pc
162and return its exit code.
163.It Fn disk_detach
164Detach a disk; free storage for the disklabel, remove the disk
165from the disklist, and decrement the system disk count.
166If the count drops below zero, panic.
167.It Fn disk_destroy
168Release resources used by the disk structure when it is no longer
169required.
170.It Fn disk_busy
171Increment the disk's
172.Dq busy counter .
173If this counter goes from 0 to 1, set the timestamp corresponding to
174this transfer.
175.It Fn disk_unbusy
176Decrement a disk's busy counter.
177If the count drops below zero, panic.
178Get the current time, subtract the disk's timestamp from it, and add
179the difference to the disk's running total.
180Set the disk's timestamp to the current time.
181If the provided byte count is greater than 0, add it to the disk's
182running total and increment the number of transfers performed by the disk.
183The third argument
184.Ar read
185specifies the direction of I/O;
186if non-zero it means reading from the disk,
187otherwise it means writing to the disk.
188.It Fn disk_isbusy
189Returns
190.Ar true
191if the disk is marked as busy and false if it is not.
192.It Fn disk_find
193Return a pointer to the disk structure corresponding to the name provided,
194or NULL if the disk does not exist.
195.It Fn disk_blocksize
196Initialize
197.Fa dk_blkshift
198and
199.Fa dk_byteshift
200members of
201.Fa struct disk
202with suitable values derived from the supplied physical blocksize.
203It is only necessary to call this function if the device's physical blocksize
204is not
205.Dv DEV_BSIZE .
206.El
207.Pp
208The functions typically called by device drivers are
209.Fn disk_init ,
210.Fn disk_attach ,
211.Fn disk_begindetach ,
212.Fn disk_detach ,
213.Fn disk_destroy ,
214.Fn disk_busy ,
215.Fn disk_unbusy ,
216and
217.Fn disk_blocksize .
218The function
219.Fn disk_find
220is provided as a utility function.
221.Sh DISK IOCTLS
222The following ioctls should be implemented by disk drivers:
223.Bl -tag -width "xxxxxx"
224.It Dv DIOCGDINFO "struct disklabel"
225Get disklabel.
226.It Dv DIOCSDINFO "struct disklabel"
227Set in-memory disklabel.
228.It Dv DIOCWDINFO "struct disklabel"
229Set in-memory disklabel and write on-disk disklabel.
230.It Dv DIOCGPART "struct partinfo"
231Get partition information.
232This is used internally.
233.It Dv DIOCRFORMAT "struct format_op"
234Read format.
235.It Dv DIOCWFORMAT "struct format_op"
236Write format.
237.It Dv DIOCSSTEP "int"
238Set step rate.
239.It Dv DIOCSRETRIES "int"
240Set number of retries.
241.It Dv DIOCKLABEL "int"
242Specify whether to keep or drop the in-memory disklabel
243when the device is closed.
244.It Dv DIOCWLABEL "int"
245Enable or disable writing to the part of the disk that contains the label.
246.It Dv DIOCSBAD "struct dkbad"
247Set kernel dkbad.
248.It Dv DIOCEJECT "int"
249Eject removable disk.
250.It Dv DIOCLOCK "int"
251Lock or unlock disk pack.
252For devices with removable media, locking is intended to prevent
253the operator from removing the media.
254.It Dv DIOCGDEFLABEL "struct disklabel"
255Get default label.
256.It Dv DIOCCLRLABEL
257Clear disk label.
258.It Dv DIOCGCACHE "int"
259Get status of disk read and write caches.
260The result is a bitmask containing the following values:
261.Bl -tag -width DKCACHE_RCHANGE
262.It Dv DKCACHE_READ
263Read cache enabled.
264.It Dv DKCACHE_WRITE
265Write(back) cache enabled.
266.It Dv DKCACHE_RCHANGE
267Read cache enable is changeable.
268.It Dv DKCACHE_WCHANGE
269Write cache enable is changeable.
270.It Dv DKCACHE_SAVE
271Cache parameters may be saved, so that they persist across reboots
272or device detach/attach cycles.
273.El
274.It Dv DIOCSCACHE "int"
275Set status of disk read and write caches.
276The input is a bitmask in the same format as used for
277.Dv DIOCGCACHE .
278.It Dv DIOCCACHESYNC "int"
279Synchronise the disk cache.
280This causes information in the disk's write cache (if any)
281to be flushed to stable storage.
282The argument specifies whether or not to force a flush even if
283the kernel believes that there is no outstanding data.
284.It Dv DIOCBSLIST "struct disk_badsecinfo"
285Get bad sector list.
286.It Dv DIOCBSFLUSH
287Flush bad sector list.
288.It Dv DIOCAWEDGE "struct dkwedge_info"
289Add wedge.
290.It Dv DIOCGWEDGEINFO "struct dkwedge_info"
291Get wedge information.
292.It Dv DIOCDWEDGE "struct dkwedge_info"
293Delete wedge.
294.It Dv DIOCLWEDGES "struct dkwedge_list"
295List wedges.
296.It Dv DIOCGSTRATEGY "struct disk_strategy"
297Get disk buffer queue strategy.
298.It Dv DIOCSSTRATEGY "struct disk_strategy"
299Set disk buffer queue strategy.
300.It Dv DIOCGDISKINFO "struct plistref"
301Get disk-info dictionary.
302.El
303.Sh USING THE FRAMEWORK
304This section includes a description of the basic use of the framework
305and example usage of its functions.
306Actual implementation of a device driver which uses the framework
307may vary.
308.Pp
309Each device in the system uses a
310.Dq softc
311structure which contains autoconfiguration and state information for that
312device.
313In the case of disks, the softc should also contain one instance
314of the disk structure, e.g.:
315.Bd -literal
316struct foo_softc {
317	device_t	sc_dev;		/* generic device information */
318	struct	disk	sc_dk;		/* generic disk information */
319	[ . . . more . . . ]
320};
321.Ed
322.Pp
323In order for the system to gather metrics data about a disk, the disk must
324be registered with the system.
325The
326.Fn disk_attach
327routine performs all of the functions currently required to register a disk
328with the system including allocation of disklabel storage space,
329recording of the time since boot that the disk was attached, and insertion
330into the disklist.
331Note that since this function allocates storage space for the disklabel,
332it must be called before the disklabel is read from the media or used in
333any other way.
334Before
335.Fn disk_attach
336is called, portions of the disk structure must be initialized with
337data specific to that disk.
338For example, in the
339.Dq foo
340disk driver, the following would be performed in the autoconfiguration
341.Dq attach
342routine:
343.Bd -literal
344void
345fooattach(device_t parent, device_t self, void *aux)
346{
347	struct foo_softc *sc = device_private(self);
348	[ . . . ]
349
350	/* Initialize and attach the disk structure. */
351	disk_init(\*[Am]sc-\*[Gt]sc_dk, device_xname(self), \*[Am]foodkdriver);
352	disk_attach(\*[Am]sc-\*[Gt]sc_dk);
353
354	/* Read geometry and fill in pertinent parts of disklabel. */
355	[ . . . ]
356	disk_blocksize(\*[Am]sc-\*[Gt]sc_dk, bytes_per_sector);
357}
358.Ed
359.Pp
360The
361.Nm foodkdriver
362above is the disk's
363.Dq driver
364switch.
365This switch currently includes a pointer to the disk's
366.Dq strategy
367routine.
368This switch needs to have global scope and should be initialized as follows:
369.Bd -literal
370void foostrategy(struct buf *);
371
372const struct dkdriver foodkdriver = {
373	.d_strategy = foostrategy,
374};
375.Ed
376.Pp
377Once the disk is attached, metrics may be gathered on that disk.
378In order to gather metrics data, the driver must tell the framework when
379the disk starts and stops operations.
380This functionality is provided by the
381.Fn disk_busy
382and
383.Fn disk_unbusy
384routines.
385Because
386.Nm struct disk
387is part of the device driver's private data, it needs to be guarded.
388Mutual exclusion must be provided by the driver, because
389.Fn disk_busy
390and
391.Fn disk_unbusy
392are not thread safe.
393The
394.Fn disk_busy
395routine should be called immediately before a command to the disk is
396sent, e.g.:
397.Bd -literal
398void
399foostart(sc)
400	struct foo_softc *sc;
401{
402	[ . . . ]
403
404	/* Get buffer from drive's transfer queue. */
405	[ . . . ]
406
407	/* Build command to send to drive. */
408	[ . . . ]
409
410	/* Tell the disk framework we're going busy. */
411	mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx);
412	disk_busy(\*[Am]sc-\*[Gt]sc_dk);
413	mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx);
414
415	/* Send command to the drive. */
416	[ . . . ]
417}
418.Ed
419.Pp
420When
421.Fn disk_busy
422is called, a timestamp is taken if the disk's busy counter moves from
4230 to 1, indicating the disk has gone from an idle to non-idle state.
424At the end of a transaction, the
425.Fn disk_unbusy
426routine should be called.
427This routine performs some consistency checks,
428such as ensuring that the calls to
429.Fn disk_busy
430and
431.Fn disk_unbusy
432are balanced.
433This routine also performs the actual metrics calculation.
434A timestamp is taken and the difference from the timestamp taken in
435.Fn disk_busy
436is added to the disk's total running time.
437The disk's timestamp is then updated in case there is more than one
438pending transfer on the disk.
439A byte count is also added to the disk's running total, and if greater than
440zero, the number of transfers the disk has performed is incremented.
441The third argument
442.Ar read
443specifies the direction of I/O;
444if non-zero it means reading from the disk,
445otherwise it means writing to the disk.
446.Bd -literal
447void
448foodone(xfer)
449	struct foo_xfer *xfer;
450{
451	struct foo_softc *sc = (struct foo_softc *)xfer-\*[Gt]xf_softc;
452	struct buf *bp = xfer-\*[Gt]xf_buf;
453	long nbytes;
454	[ . . . ]
455
456	/*
457	 * Get number of bytes transferred.  If there is no buf
458	 * associated with the xfer, we are being called at the
459	 * end of a non-I/O command.
460	 */
461	if (bp == NULL)
462		nbytes = 0;
463	else
464		nbytes = bp-\*[Gt]b_bcount - bp-\*[Gt]b_resid;
465
466	[ . . . ]
467
468	mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx);
469	/* Notify the disk framework that we've completed the transfer. */
470	disk_unbusy(\*[Am]sc-\*[Gt]sc_dk, nbytes,
471	    bp != NULL ? bp-\*[Gt]b_flags \*[Am] B_READ : 0);
472	mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx);
473
474	[ . . . ]
475}
476.Ed
477.Pp
478.Fn disk_isbusy
479is used to get the status of a disk device; it returns true if the device is
480currently busy and false if it is not.
481Like
482.Fn disk_busy
483and
484.Fn disk_unbusy
485it requires explicit locking by the caller.
486.Sh CODE REFERENCES
487This section describes places within the
488.Nx
489source tree where actual
490code implementing or using the disk framework can be found.
491All pathnames are relative to
492.Pa /usr/src .
493.Pp
494The disk framework itself is implemented within the file
495.Pa sys/kern/subr_disk.c .
496Data structures and function prototypes for the framework are located in
497.Pa sys/sys/disk.h .
498.Pp
499The
500.Nx
501machine-independent SCSI disk and CD-ROM drivers use the
502disk framework.
503They are located in
504.Pa sys/scsi/sd.c
505and
506.Pa sys/scsi/cd.c .
507.Pp
508The
509.Nx
510.Nm ccd ,
511.Nm dm ,
512and
513.Nm vnd
514drivers use the detachment capability of the framework.
515They are located in
516.Pa sys/dev/ccd.c ,
517.Pa sys/dev/vnd.c ,
518and
519.Pa sys/dev/dm/device-mapper.c .
520.Sh SEE ALSO
521.Xr ccd 4 ,
522.Xr dm 4 ,
523.Xr vnd 4
524.Sh HISTORY
525The
526.Nx
527generic disk framework appeared in
528.Nx 1.2 .
529.Sh AUTHORS
530The
531.Nx
532generic disk framework was architected and implemented by
533.An Jason R. Thorpe
534.Aq thorpej@NetBSD.org .
535