xref: /netbsd-src/share/man/man9/disk.9 (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1.\"	$NetBSD: disk.9,v 1.43 2017/01/23 11:42:03 abhinav Exp $
2.\"
3.\" Copyright (c) 1995, 1996 Jason R. Thorpe.
4.\" All rights reserved.
5.\"
6.\" Redistribution and use in source and binary forms, with or without
7.\" modification, are permitted provided that the following conditions
8.\" are met:
9.\" 1. Redistributions of source code must retain the above copyright
10.\"    notice, this list of conditions and the following disclaimer.
11.\" 2. Redistributions in binary form must reproduce the above copyright
12.\"    notice, this list of conditions and the following disclaimer in the
13.\"    documentation and/or other materials provided with the distribution.
14.\" 3. All advertising materials mentioning features or use of this software
15.\"    must display the following acknowledgement:
16.\"	This product includes software developed for the NetBSD Project
17.\"	by Jason R. Thorpe.
18.\" 4. The name of the author may not be used to endorse or promote products
19.\"    derived from this software without specific prior written permission.
20.\"
21.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.Dd December 29, 2014
34.Dt DISK 9
35.Os
36.Sh NAME
37.Nm disk ,
38.Nm disk_init ,
39.Nm disk_attach ,
40.Nm disk_begindetach ,
41.Nm disk_detach ,
42.Nm disk_destroy ,
43.Nm disk_busy ,
44.Nm disk_unbusy ,
45.Nm disk_isbusy ,
46.Nm disk_find ,
47.Nm disk_set_info
48.Nd generic disk framework
49.Sh SYNOPSIS
50.In sys/types.h
51.In sys/disklabel.h
52.In sys/disk.h
53.Ft void
54.Fn disk_init "struct disk *" "const char *name" "const struct dkdriver *driver"
55.Ft void
56.Fn disk_attach "struct disk *"
57.Ft int
58.Fn disk_begindetach "struct disk *" "int (*lastclose)(device_t)" "device_t self" "int flags"
59.Ft void
60.Fn disk_detach "struct disk *"
61.Ft void
62.Fn disk_destroy "struct disk *"
63.Ft void
64.Fn disk_busy "struct disk *"
65.Ft void
66.Fn disk_unbusy "struct disk *" "long bcount" "int read"
67.Ft bool
68.Fn disk_isbusy "struct disk *"
69.Ft struct disk *
70.Fn disk_find "const char *"
71.Ft void
72.Fn disk_set_info "device_t" "struct disk *" "const char *type"
73.Sh DESCRIPTION
74The
75.Nx
76generic disk framework is designed to provide flexible,
77scalable, and consistent handling of disk state and metrics information.
78The fundamental component of this framework is the
79.Nm disk
80structure, which is defined as follows:
81.Bd -literal
82struct disk {
83	TAILQ_ENTRY(disk) dk_link;	/* link in global disklist */
84	const char	*dk_name;	/* disk name */
85	prop_dictionary_t dk_info;	/* reference to disk-info dictionary */
86	int		dk_bopenmask;	/* block devices open */
87	int		dk_copenmask;	/* character devices open */
88	int		dk_openmask;	/* composite (bopen|copen) */
89	int		dk_state;	/* label state   ### */
90	int		dk_blkshift;	/* shift to convert DEV_BSIZE to blks */
91	int		dk_byteshift;	/* shift to convert bytes to blks */
92
93	/*
94	 * Metrics data; note that some metrics may have no meaning
95	 * on certain types of disks.
96	 */
97	struct io_stats	*dk_stats;
98
99	const struct dkdriver *dk_driver;	/* pointer to driver */
100
101	/*
102	 * Information required to be the parent of a disk wedge.
103	 */
104	kmutex_t	dk_rawlock;	/* lock on these fields */
105	u_int		dk_rawopens;	/* # of opens of rawvp */
106	struct vnode	*dk_rawvp;	/* vnode for the RAW_PART bdev */
107
108	kmutex_t	dk_openlock;	/* lock on these and openmask */
109	u_int		dk_nwedges;	/* # of configured wedges */
110					/* all wedges on this disk */
111	LIST_HEAD(, dkwedge_softc) dk_wedges;
112
113	/*
114	 * Disk label information.  Storage for the in-core disk label
115	 * must be dynamically allocated, otherwise the size of this
116	 * structure becomes machine-dependent.
117	 */
118	daddr_t		dk_labelsector;		/* sector containing label */
119	struct disklabel *dk_label;	/* label */
120	struct cpu_disklabel *dk_cpulabel;
121};
122.Ed
123.Pp
124The system maintains a global linked-list of all disks attached to the
125system.
126This list, called
127.Nm disklist ,
128may grow or shrink over time as disks are dynamically added and removed
129from the system.
130Drivers which currently make use of the detachment
131capability of the framework are the
132.Nm ccd ,
133.Nm dm ,
134and
135.Nm vnd
136pseudo-device drivers.
137.Pp
138The following is a brief description of each function in the framework:
139.Bl -tag -width ".Fn disk_set_info"
140.It Fn disk_init
141Initialize the disk structure.
142.It Fn disk_attach
143Attach a disk; allocate storage for the disklabel, set the
144.Dq attached time
145timestamp, insert the disk into the disklist, and increment the
146system disk count.
147.It Fn disk_begindetach
148Check whether the disk is open, and if not, return 0.
149If the disk is open, and
150.Dv DETACH_FORCE
151is not set in
152.Fa flags ,
153return
154.Dv EBUSY .
155Otherwise, call the provided
156.Fa lastclose
157routine
158.Po
159if not
160.Dv NULL
161.Pc
162and return its exit code.
163.It Fn disk_detach
164Detach a disk; free storage for the disklabel, remove the disk
165from the disklist, and decrement the system disk count.
166If the count drops below zero, panic.
167.It Fn disk_destroy
168Release resources used by the disk structure when it is no longer
169required.
170.It Fn disk_busy
171Increment the disk's
172.Dq busy counter .
173If this counter goes from 0 to 1, set the timestamp corresponding to
174this transfer.
175.It Fn disk_unbusy
176Decrement a disk's busy counter.
177If the count drops below zero, panic.
178Get the current time, subtract the disk's timestamp from it, and add
179the difference to the disk's running total.
180Set the disk's timestamp to the current time.
181If the provided byte count is greater than 0, add it to the disk's
182running total and increment the number of transfers performed by the disk.
183The third argument
184.Ar read
185specifies the direction of I/O;
186if non-zero it means reading from the disk,
187otherwise it means writing to the disk.
188.It Fn disk_isbusy
189Returns
190.Ar true
191if disk is marked as busy and false if it is not.
192.It Fn disk_find
193Return a pointer to the disk structure corresponding to the name provided,
194or
195.Dv NULL
196if the disk does not exist.
197.It Fn disk_set_info
198Set up the disk-info dictionary and other dependent values of the disk structure.
199The driver must have initialized the dk_geom member of
200.Fa struct disk
201with suitable values.
202If
203.Fa type
204is not
205.Dv NULL ,
206it will be added to the dictionary.
207.El
208.Pp
209The functions typically called by device drivers are
210.Fn disk_init ,
211.Fn disk_attach ,
212.Fn disk_begindetach ,
213.Fn disk_detach ,
214.Fn disk_destroy ,
215.Fn disk_busy ,
216.Fn disk_unbusy ,
217and
218.Fn disk_set_info .
219The function
220.Fn disk_find
221is provided as a utility function.
222.Sh DISK IOCTLS
223The following ioctls should be implemented by disk drivers:
224.Bl -tag -width "xxxxxx"
225.It Dv DIOCGDINFO "struct disklabel"
226Get disklabel.
227.It Dv DIOCSDINFO "struct disklabel"
228Set in-memory disklabel.
229.It Dv DIOCWDINFO "struct disklabel"
230Set in-memory disklabel and write on-disk disklabel.
231.It Dv DIOCGPART "struct partinfo"
232Get partition information.
233This is used internally.
234.It Dv DIOCRFORMAT "struct format_op"
235Read format.
236.It Dv DIOCWFORMAT "struct format_op"
237Write format.
238.It Dv DIOCSSTEP "int"
239Set step rate.
240.It Dv DIOCSRETRIES "int"
241Set number of retries.
242.It Dv DIOCKLABEL "int"
243Specify whether to keep or drop the in-memory disklabel
244when the device is closed.
245.It Dv DIOCWLABEL "int"
246Enable or disable writing to the part of the disk that contains the label.
247.It Dv DIOCSBAD "struct dkbad"
248Set kernel dkbad.
249.It Dv DIOCEJECT "int"
250Eject removable disk.
251.It Dv DIOCLOCK "int"
252Lock or unlock disk pack.
253For devices with removable media, locking is intended to prevent
254the operator from removing the media.
255.It Dv DIOCGDEFLABEL "struct disklabel"
256Get default label.
257.It Dv DIOCCLRLABEL
258Clear disk label.
259.It Dv DIOCGCACHE "int"
260Get status of disk read and write caches.
261The result is a bitmask containing the following values:
262.Bl -tag -width DKCACHE_RCHANGE
263.It Dv DKCACHE_READ
264Read cache enabled.
265.It Dv DKCACHE_WRITE
266Write(back) cache enabled.
267.It Dv DKCACHE_RCHANGE
268Read cache enable is changeable.
269.It Dv DKCACHE_WCHANGE
270Write cache enable is changeable.
271.It Dv DKCACHE_SAVE
272Cache parameters may be saved, so that they persist across reboots
273or device detach/attach cycles.
274.El
275.It Dv DIOCSCACHE "int"
276Set status of disk read and write caches.
277The input is a bitmask in the same format as used for
278.Dv DIOCGCACHE .
279.It Dv DIOCCACHESYNC "int"
280Synchronise the disk cache.
281This causes information in the disk's write cache (if any)
282to be flushed to stable storage.
283The argument specifies whether or not to force a flush even if
284the kernel believes that there is no outstanding data.
285.It Dv DIOCBSLIST "struct disk_badsecinfo"
286Get bad sector list.
287.It Dv DIOCBSFLUSH
288Flush bad sector list.
289.It Dv DIOCAWEDGE "struct dkwedge_info"
290Add wedge.
291.It Dv DIOCGWEDGEINFO "struct dkwedge_info"
292Get wedge information.
293.It Dv DIOCDWEDGE "struct dkwedge_info"
294Delete wedge.
295.It Dv DIOCLWEDGES "struct dkwedge_list"
296List wedges.
297.It Dv DIOCGSTRATEGY "struct disk_strategy"
298Get disk buffer queue strategy.
299.It Dv DIOCSSTRATEGY "struct disk_strategy"
300Set disk buffer queue strategy.
301.It Dv DIOCGDISKINFO "struct plistref"
302Get disk-info dictionary.
303.It Dv DIOCGMEDIASIZE "off_t"
304Get disk size in bytes.
305.It Dv DIOCGSECTORSIZE "u_int"
306Get sector size in bytes.
307.El
308.Sh USING THE FRAMEWORK
309This section includes a description of the basic use of the framework
310and example usage of its functions.
311Actual implementation of a device driver which uses the framework
312may vary.
313.Pp
314Each device in the system uses a
315.Dq softc
316structure which contains autoconfiguration and state information for that
317device.
318In the case of disks, the softc should also contain one instance
319of the disk structure, e.g.:
320.Bd -literal
321struct foo_softc {
322	device_t	sc_dev;		/* generic device information */
323	struct	disk	sc_dk;		/* generic disk information */
324	[ . . . more . . . ]
325};
326.Ed
327.Pp
328In order for the system to gather metrics data about a disk, the disk must
329be registered with the system.
330The
331.Fn disk_attach
332routine performs all of the functions currently required to register a disk
333with the system including allocation of disklabel storage space,
334recording of the time since boot that the disk was attached, and insertion
335into the disklist.
336Note that since this function allocates storage space for the disklabel,
337it must be called before the disklabel is read from the media or used in
338any other way.
339Before
340.Fn disk_attach
341is called, portions of the disk structure must be initialized with
342data specific to that disk.
343For example, in the
344.Dq foo
345disk driver, the following would be performed in the autoconfiguration
346.Dq attach
347routine:
348.Bd -literal
349void
350fooattach(device_t parent, device_t self, void *aux)
351{
352	struct foo_softc *sc = device_private(self);
353	[ . . . ]
354
355	/* Initialize and attach the disk structure. */
356	disk_init(\*[Am]sc-\*[Gt]sc_dk, device_xname(self), \*[Am]foodkdriver);
357	disk_attach(\*[Am]sc-\*[Gt]sc_dk);
358
359	/* Read geometry and fill in pertinent parts of disklabel. */
360	/* Initialize geometry values of the disk structure */
361	[ . . . ]
362	disk_set_info(self, \*[Am]sc-\*[Gt]sc_dk, type);
363}
364.Ed
365.Pp
366The
367.Nm foodkdriver
368above is the disk's
369.Dq driver
370switch.
371This switch currently includes pointers to several driver entry points,
372where only the
373.Nm d_strategy
374entry point is used by the disk framework.
375This switch needs to have global scope and should be initialized as follows:
376.Bd -literal
377void    (foostrategy)(struct buf *);
378void    (foominphys)(struct buf *);
379int     (fooopen)(dev_t, int, int, struct lwp *);
380int     (fooclose)(dev_t, int, int, struct lwp *);
381int     (foo_discard)(device_t, off_t, off_t);
382int     (foo_diskstart)(device_t, struct buf *);
383void    (foo_iosize)(device_t, int *);
384int     (foo_dumpblocks)(device_t, void *, daddr_t, int);
385int     (foo_lastclose)(device_t);
386int     (foo_firstopen)(device_t, dev_t, int, int);
387int     (foo_label)(device_t, struct disklabel *);
388
389const struct dkdriver foodkdriver = {
390	.d_open = fooopen,
391	.d_close = fooclose,
392	.d_strategy = foostrategy,
393	.d_minphys = foominphys,
394	.d_discard = foo_discard,
395	.d_diskstart = foo_diskstart,	/* optional */
396	.d_dumpblocks = foo_dumpblocks,	/* optional */
397	.d_iosize = foo_iosize,		/* optional */
398	.d_firstopen = foo_firstopen,	/* optional */
399	.d_lastclose = foo_lastclose,	/* optional */
400	.d_label = foo_label,		/* optional */
401};
402.Ed
403.Pp
404Once the disk is attached, metrics may be gathered on that disk.
405In order to gather metrics data, the driver must tell the framework when
406the disk starts and stops operations.
407This functionality is provided by the
408.Fn disk_busy
409and
410.Fn disk_unbusy
411routines.
412Because
413.Nm struct disk
414is part of device driver private data it needs to be guarded.
415Mutual exclusion must be provided by the driver, because
416.Fn disk_busy
417and
418.Fn disk_unbusy
419are not thread safe.
420The
421.Fn disk_busy
422routine should be called immediately before a command to the disk is
423sent, e.g.:
424.Bd -literal
425void
426foostart(sc)
427	struct foo_softc *sc;
428{
429	[ . . . ]
430
431	/* Get buffer from drive's transfer queue. */
432	[ . . . ]
433
434	/* Build command to send to drive. */
435	[ . . . ]
436
437	/* Tell the disk framework we're going busy. */
438	mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx);
439	disk_busy(\*[Am]sc-\*[Gt]sc_dk);
440	mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx);
441
442	/* Send command to the drive. */
443	[ . . . ]
444}
445.Ed
446.Pp
447When
448.Fn disk_busy
449is called, a timestamp is taken if the disk's busy counter moves from
4500 to 1, indicating the disk has gone from an idle to non-idle state.
451At the end of a transaction, the
452.Fn disk_unbusy
453routine should be called.
454This routine performs some consistency checks,
455such as ensuring that the calls to
456.Fn disk_busy
457and
458.Fn disk_unbusy
459are balanced.
460This routine also performs the actual metrics calculation.
461A timestamp is taken and the difference from the timestamp taken in
462.Fn disk_busy
463is added to the disk's total running time.
464The disk's timestamp is then updated in case there is more than one
465pending transfer on the disk.
466A byte count is also added to the disk's running total, and if greater than
467zero, the number of transfers the disk has performed is incremented.
468The third argument
469.Ar read
470specifies the direction of I/O;
471if non-zero it means reading from the disk,
472otherwise it means writing to the disk.
473.Bd -literal
474void
475foodone(xfer)
476	struct foo_xfer *xfer;
477{
478	struct foo_softc *sc = (struct foo_softc *)xfer-\*[Gt]xf_softc;
479	struct buf *bp = xfer-\*[Gt]xf_buf;
480	long nbytes;
481	[ . . . ]
482
483	/*
484	 * Get number of bytes transferred.  If there is no buf
485	 * associated with the xfer, we are being called at the
486	 * end of a non-I/O command.
487	 */
488	if (bp == NULL)
489		nbytes = 0;
490	else
491		nbytes = bp-\*[Gt]b_bcount - bp-\*[Gt]b_resid;
492
493	[ . . . ]
494
495	mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx);
496	/* Notify the disk framework that we've completed the transfer. */
497	disk_unbusy(\*[Am]sc-\*[Gt]sc_dk, nbytes,
498	    bp != NULL ? bp-\*[Gt]b_flags \*[Am] B_READ : 0);
499	mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx);
500
501	[ . . . ]
502}
503.Ed
504.Pp
505.Fn disk_isbusy
506is used to get the status of a disk device.
507It returns true if the device is currently busy and false if it is not.
508Like
509.Fn disk_busy
510and
511.Fn disk_unbusy
512it requires explicit locking by the caller.
513.Sh CODE REFERENCES
514The disk framework itself is implemented within the file
515.Pa sys/kern/subr_disk.c .
516Data structures and function prototypes for the framework are located in
517.Pa sys/sys/disk.h .
518.Pp
519The
520.Nx
521machine-independent SCSI disk and CD-ROM drivers use the
522disk framework.
523They are located in
524.Pa sys/scsi/sd.c
525and
526.Pa sys/scsi/cd.c .
527.Pp
528The
529.Nx
530.Nm ccd ,
531.Nm dm ,
532and
533.Nm vnd
534drivers use the detachment capability of the framework.
535They are located in
536.Pa sys/dev/ccd.c ,
537.Pa sys/dev/vnd.c ,
538and
539.Pa sys/dev/dm/device-mapper.c .
540.Sh SEE ALSO
541.Xr ccd 4 ,
542.Xr dm 4 ,
543.Xr vnd 4 ,
544.Xr dksubr 9
545.Sh HISTORY
546The
547.Nx
548generic disk framework appeared in
549.Nx 1.2 .
550.Sh AUTHORS
551The
552.Nx
553generic disk framework was architected and implemented by
554.An Jason R. Thorpe
555.Aq thorpej@NetBSD.org .
556