xref: /netbsd-src/share/man/man9/disk.9 (revision 01869ca4d24a86379a68731bf9706a9f0820fe4e)
1.\"	$NetBSD: disk.9,v 1.46 2017/07/03 21:28:48 wiz Exp $
2.\"
3.\" Copyright (c) 1995, 1996 Jason R. Thorpe.
4.\" All rights reserved.
5.\"
6.\" Redistribution and use in source and binary forms, with or without
7.\" modification, are permitted provided that the following conditions
8.\" are met:
9.\" 1. Redistributions of source code must retain the above copyright
10.\"    notice, this list of conditions and the following disclaimer.
11.\" 2. Redistributions in binary form must reproduce the above copyright
12.\"    notice, this list of conditions and the following disclaimer in the
13.\"    documentation and/or other materials provided with the distribution.
14.\" 3. All advertising materials mentioning features or use of this software
15.\"    must display the following acknowledgement:
16.\"	This product includes software developed for the NetBSD Project
17.\"	by Jason R. Thorpe.
18.\" 4. The name of the author may not be used to endorse or promote products
19.\"    derived from this software without specific prior written permission.
20.\"
21.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31.\" SUCH DAMAGE.
32.\"
33.Dd March 5, 2017
34.Dt DISK 9
35.Os
36.Sh NAME
37.Nm disk ,
38.Nm disk_init ,
39.Nm disk_attach ,
40.Nm disk_begindetach ,
41.Nm disk_detach ,
42.Nm disk_destroy ,
43.Nm disk_wait ,
44.Nm disk_busy ,
45.Nm disk_unbusy ,
46.Nm disk_isbusy ,
47.Nm disk_find ,
48.Nm disk_set_info
49.Nd generic disk framework
50.Sh SYNOPSIS
51.In sys/types.h
52.In sys/disklabel.h
53.In sys/disk.h
54.Ft void
55.Fn disk_init "struct disk *" "const char *name" "const struct dkdriver *driver"
56.Ft void
57.Fn disk_attach "struct disk *"
58.Ft int
59.Fn disk_begindetach "struct disk *" "int (*lastclose)(device_t)" "device_t self" "int flags"
60.Ft void
61.Fn disk_detach "struct disk *"
62.Ft void
63.Fn disk_destroy "struct disk *"
64.Ft void
65.Fn disk_wait "struct disk *"
66.Ft void
67.Fn disk_busy "struct disk *"
68.Ft void
69.Fn disk_unbusy "struct disk *" "long bcount" "int read"
70.Ft bool
71.Fn disk_isbusy "struct disk *"
72.Ft struct disk *
73.Fn disk_find "const char *"
74.Ft void
75.Fn disk_set_info "device_t" "struct disk *" "const char *type"
76.Sh DESCRIPTION
77The
78.Nx
79generic disk framework is designed to provide flexible,
80scalable, and consistent handling of disk state and metrics information.
81The fundamental component of this framework is the
82.Nm disk
83structure, which is defined as follows:
84.Bd -literal
85struct disk {
86	TAILQ_ENTRY(disk) dk_link;	/* link in global disklist */
87	const char	*dk_name;	/* disk name */
88	prop_dictionary_t dk_info;	/* reference to disk-info dictionary */
89	int		dk_bopenmask;	/* block devices open */
90	int		dk_copenmask;	/* character devices open */
91	int		dk_openmask;	/* composite (bopen|copen) */
92	int		dk_state;	/* label state   ### */
93	int		dk_blkshift;	/* shift to convert DEV_BSIZE to blks */
94	int		dk_byteshift;	/* shift to convert bytes to blks */
95
96	/*
97	 * Metrics data; note that some metrics may have no meaning
98	 * on certain types of disks.
99	 */
100	struct io_stats	*dk_stats;
101
102	const struct dkdriver *dk_driver;	/* pointer to driver */
103
104	/*
105	 * Information required to be the parent of a disk wedge.
106	 */
107	kmutex_t	dk_rawlock;	/* lock on these fields */
108	u_int		dk_rawopens;	/* # of opens of rawvp */
109	struct vnode	*dk_rawvp;	/* vnode for the RAW_PART bdev */
110
111	kmutex_t	dk_openlock;	/* lock on these and openmask */
112	u_int		dk_nwedges;	/* # of configured wedges */
113					/* all wedges on this disk */
114	LIST_HEAD(, dkwedge_softc) dk_wedges;
115
116	/*
117	 * Disk label information.  Storage for the in-core disk label
118	 * must be dynamically allocated, otherwise the size of this
119	 * structure becomes machine-dependent.
120	 */
121	daddr_t		dk_labelsector;		/* sector containing label */
122	struct disklabel *dk_label;	/* label */
123	struct cpu_disklabel *dk_cpulabel;
124};
125.Ed
126.Pp
127The system maintains a global linked-list of all disks attached to the
128system.
129This list, called
130.Nm disklist ,
131may grow or shrink over time as disks are dynamically added and removed
132from the system.
133Drivers which currently make use of the detachment
134capability of the framework are the
135.Nm ccd ,
136.Nm dm ,
137and
138.Nm vnd
139pseudo-device drivers.
140.Pp
141The following is a brief description of each function in the framework:
142.Bl -tag -width ".Fn disk_set_info"
143.It Fn disk_init
144Initialize the disk structure.
145.It Fn disk_attach
146Attach a disk; allocate storage for the disklabel, set the
147.Dq attached time
148timestamp, insert the disk into the disklist, and increment the
149system disk count.
150.It Fn disk_begindetach
151Check whether the disk is open, and if not, return 0.
152If the disk is open, and
153.Dv DETACH_FORCE
154is not set in
155.Fa flags ,
156return
157.Dv EBUSY .
158Otherwise, call the provided
159.Fa lastclose
160routine
161.Po
162if not
163.Dv NULL
164.Pc
165and return its exit code.
166.It Fn disk_detach
167Detach a disk; free storage for the disklabel, remove the disk
168from the disklist, and decrement the system disk count.
169If the count drops below zero, panic.
170.It Fn disk_destroy
171Release resources used by the disk structure when it is no longer
172required.
173.It Fn disk_wait
174Disk timings are measured by counting the number of queued
175requests (wait counter) and requests issued to the hardware (busy counter)
176and keeping a timestamp of when the counters change.
177The time interval between
178two changes of a counter is accumulated into a total and also multiplied
179by the counter value and then accumulated into a sum.
180Both values can be
181used to determine how much time is spent in the driver queue or in-flight
182to the hardware as well as the average number of requests in either state.
183.Fn disk_wait
184increments the disk's wait counter and handles the accumulation.
185.It Fn disk_busy
186Decrements the disk's wait counter and increments the disk's
187.Dq busy counter ,
188and handles either accumulation.
189If the wait counter is still zero, it
190is assumed that the driver hasn't been updated to call
191.Fn disk_wait ,
192and only the values from the busy counter are available.
193.It Fn disk_unbusy
194Decrements the disk's busy counter and handles the accumulation.
195The third argument
196.Fa read
197specifies the direction of I/O;
198if non-zero it means reading from the disk,
199otherwise it means writing to the disk.
200.It Fn disk_isbusy
201Returns
202.Dv true
203if the disk is marked as busy and false if it is not.
204.It Fn disk_find
205Return a pointer to the disk structure corresponding to the name provided,
206or
207.Dv NULL
208if the disk does not exist.
209.It Fn disk_set_info
210Set up the disk-info dictionary and other dependent values of the disk structure;
211the driver must have initialized the dk_geom member of
212.Fa struct disk
213with suitable values.
214If
215.Fa type
216is not
217.Dv NULL ,
218it will be added to the dictionary.
219.El
220.Pp
221The functions typically called by device drivers are
222.Fn disk_init ,
223.Fn disk_attach ,
224.Fn disk_begindetach ,
225.Fn disk_detach ,
226.Fn disk_destroy ,
227.Fn disk_wait ,
228.Fn disk_busy ,
229.Fn disk_unbusy ,
230and
231.Fn disk_set_info .
232The function
233.Fn disk_find
234is provided as a utility function.
235.Sh DISK IOCTLS
236The following ioctls should be implemented by disk drivers:
237.Bl -tag -width "xxxxxx"
238.It Dv DIOCGDINFO "struct disklabel"
239Get disklabel.
240.It Dv DIOCSDINFO "struct disklabel"
241Set in-memory disklabel.
242.It Dv DIOCWDINFO "struct disklabel"
243Set in-memory disklabel and write on-disk disklabel.
244.It Dv DIOCGPART "struct partinfo"
245Get partition information.
246This is used internally.
247.It Dv DIOCRFORMAT "struct format_op"
248Read format.
249.It Dv DIOCWFORMAT "struct format_op"
250Write format.
251.It Dv DIOCSSTEP "int"
252Set step rate.
253.It Dv DIOCSRETRIES "int"
254Set number of retries.
255.It Dv DIOCKLABEL "int"
256Specify whether to keep or drop the in-memory disklabel
257when the device is closed.
258.It Dv DIOCWLABEL "int"
259Enable or disable writing to the part of the disk that contains the label.
260.It Dv DIOCSBAD "struct dkbad"
261Set kernel dkbad.
262.It Dv DIOCEJECT "int"
263Eject removable disk.
264.It Dv DIOCLOCK "int"
265Lock or unlock disk pack.
266For devices with removable media, locking is intended to prevent
267the operator from removing the media.
268.It Dv DIOCGDEFLABEL "struct disklabel"
269Get default label.
270.It Dv DIOCCLRLABEL
271Clear disk label.
272.It Dv DIOCGCACHE "int"
273Get status of disk read and write caches.
274The result is a bitmask containing the following values:
275.Bl -tag -width DKCACHE_RCHANGE
276.It Dv DKCACHE_READ
277Read cache enabled.
278.It Dv DKCACHE_WRITE
279Write(back) cache enabled.
280.It Dv DKCACHE_RCHANGE
281Read cache enable is changeable.
282.It Dv DKCACHE_WCHANGE
283Write cache enable is changeable.
284.It Dv DKCACHE_SAVE
285Cache parameters may be saved, so that they persist across reboots
286or device detach/attach cycles.
287.El
288.It Dv DIOCSCACHE "int"
289Set status of disk read and write caches.
290The input is a bitmask in the same format as used for
291.Dv DIOCGCACHE .
292.It Dv DIOCCACHESYNC "int"
293Synchronise the disk cache.
294This causes information in the disk's write cache (if any)
295to be flushed to stable storage.
296The argument specifies whether or not to force a flush even if
297the kernel believes that there is no outstanding data.
298.It Dv DIOCBSLIST "struct disk_badsecinfo"
299Get bad sector list.
300.It Dv DIOCBSFLUSH
301Flush bad sector list.
302.It Dv DIOCAWEDGE "struct dkwedge_info"
303Add wedge.
304.It Dv DIOCGWEDGEINFO "struct dkwedge_info"
305Get wedge information.
306.It Dv DIOCDWEDGE "struct dkwedge_info"
307Delete wedge.
308.It Dv DIOCLWEDGES "struct dkwedge_list"
309List wedges.
310.It Dv DIOCGSTRATEGY "struct disk_strategy"
311Get disk buffer queue strategy.
312.It Dv DIOCSSTRATEGY "struct disk_strategy"
313Set disk buffer queue strategy.
314.It Dv DIOCGDISKINFO "struct plistref"
315Get disk-info dictionary.
316.It Dv DIOCGMEDIASIZE "off_t"
317Get disk size in bytes.
318.It Dv DIOCGSECTORSIZE "u_int"
319Get sector size in bytes.
320.El
321.Sh USING THE FRAMEWORK
322This section includes a description of the basic use of the framework
323and example usage of its functions.
324Actual implementation of a device driver which uses the framework
325may vary.
326.Pp
327Each device in the system uses a
328.Dq softc
329structure which contains autoconfiguration and state information for that
330device.
331In the case of disks, the softc should also contain one instance
332of the disk structure, e.g.:
333.Bd -literal
334struct foo_softc {
335	device_t	sc_dev;		/* generic device information */
336	struct	disk	sc_dk;		/* generic disk information */
337	[ . . . more . . . ]
338};
339.Ed
340.Pp
341In order for the system to gather metrics data about a disk, the disk must
342be registered with the system.
343The
344.Fn disk_attach
345routine performs all of the functions currently required to register a disk
346with the system including allocation of disklabel storage space,
347recording of the time since boot that the disk was attached, and insertion
348into the disklist.
349Note that since this function allocates storage space for the disklabel,
350it must be called before the disklabel is read from the media or used in
351any other way.
352Before
353.Fn disk_attach
354is called, portions of the disk structure must be initialized with
355data specific to that disk.
356For example, in the
357.Dq foo
358disk driver, the following would be performed in the autoconfiguration
359.Dq attach
360routine:
361.Bd -literal
362void
363fooattach(device_t parent, device_t self, void *aux)
364{
365	struct foo_softc *sc = device_private(self);
366	[ . . . ]
367
368	/* Initialize and attach the disk structure. */
369	disk_init(&sc->sc_dk, device_xname(self), &foodkdriver);
370	disk_attach(&sc->sc_dk);
371
372	/* Read geometry and fill in pertinent parts of disklabel. */
373	/* Initialize geometry values of the disk structure */
374	[ . . . ]
375	disk_set_info(self, &sc->sc_dk, type);
376}
377.Ed
378.Pp
379The
380.Nm foodkdriver
381above is the disk's
382.Dq driver
383switch.
384This switch currently includes pointers to several driver entry points,
385where only the
386.Nm d_strategy
387entry point is used by the disk framework.
388This switch needs to have global scope and should be initialized as follows:
389.Bd -literal
390void    (foostrategy)(struct buf *);
391void    (foominphys)(struct buf *);
392int     (fooopen)(dev_t, int, int, struct lwp *);
393int     (fooclose)(dev_t, int, int, struct lwp *);
394int     (foo_discard)(device_t, off_t, off_t);
395int     (foo_diskstart)(device_t, struct buf *);
396void    (foo_iosize)(device_t, int *);
397int     (foo_dumpblocks)(device_t, void *, daddr_t, int);
398int     (foo_lastclose)(device_t);
399int     (foo_firstopen)(device_t, dev_t, int, int);
400int     (foo_label)(device_t, struct disklabel *);
401
402const struct dkdriver foodkdriver = {
403	.d_open = fooopen,
404	.d_close = fooclose,
405	.d_strategy = foostrategy,
406	.d_minphys = foominphys,
407	.d_discard = foo_discard,
408	.d_diskstart = foo_diskstart,	/* optional */
409	.d_dumpblocks = foo_dumpblocks,	/* optional */
410	.d_iosize = foo_iosize,		/* optional */
411	.d_firstopen = foo_firstopen,	/* optional */
412	.d_lastclose = foo_lastclose,	/* optional */
413	.d_label = foo_label,		/* optional */
414};
415.Ed
416.Pp
417Once the disk is attached, metrics may be gathered on that disk.
418In order to gather metrics data, the driver must tell the framework when
419the disk queues, starts and stops operations.
420This functionality is provided by the
421.Fn disk_wait ,
422.Fn disk_busy
423and
424.Fn disk_unbusy
425routines.
426Because
427.Nm struct disk
428is part of device driver private data it needs to be guarded.
429Mutual exclusion must be provided by the driver;
430.Fn disk_wait ,
431.Fn disk_busy
432and
433.Fn disk_unbusy
434are not thread safe.
435The
436.Fn disk_busy
437routine should be called immediately before a command to the disk is
438sent, e.g.:
439.Bd -literal
440void
441foostrategy(struct buf *bp)
442{
443	[ . . . ]
444
445	mutex_enter(&sc->sc_dk_mtx);
446	disk_wait(&sc->sc_dk);
447
448	/* Put buffer onto drive's transfer queue */
449
450	mutex_exit(&sc->sc_dk_mtx);
451
452	foostart(sc);
453}
454
455void
456foostart(struct foo_softc *sc)
457{
458	[ . . . ]
459
460	/* Get buffer from drive's transfer queue. */
461	[ . . . ]
462
463	/* Build command to send to drive. */
464	[ . . . ]
465
466	/* Tell the disk framework we're going busy. */
467	mutex_enter(&sc->sc_dk_mtx);
468	disk_busy(&sc->sc_dk);
469	mutex_exit(&sc->sc_dk_mtx);
470
471	/* Send command to the drive. */
472	[ . . . ]
473}
474.Ed
475.Pp
476The routine
477.Fn disk_unbusy
478performs some consistency checks, such as ensuring that the calls to
479.Fn disk_busy
480and
481.Fn disk_unbusy
482are balanced.
483It also performs the final steps of the metrics calculation.
484A byte count is added to the disk's running total, and if greater than
485zero, the number of transfers the disk has performed is incremented.
486The third argument
487.Fa read
488specifies the direction of I/O;
489if non-zero it means reading from the disk,
490otherwise it means writing to the disk.
491.Bd -literal
492void
493foodone(xfer)
494	struct foo_xfer *xfer;
495{
496	struct foo_softc *sc = (struct foo_softc *)xfer->xf_softc;
497	struct buf *bp = xfer->xf_buf;
498	long nbytes;
499	[ . . . ]
500
501	/*
502	 * Get number of bytes transferred.  If there is no buf
503	 * associated with the xfer, we are being called at the
504	 * end of a non-I/O command.
505	 */
506	if (bp == NULL)
507		nbytes = 0;
508	else
509		nbytes = bp->b_bcount - bp->b_resid;
510
511	[ . . . ]
512
513	mutex_enter(&sc->sc_dk_mtx);
514	/* Notify the disk framework that we've completed the transfer. */
515	disk_unbusy(&sc->sc_dk, nbytes,
516	    bp != NULL ? bp->b_flags & B_READ : 0);
517	mutex_exit(&sc->sc_dk_mtx);
518
519	[ . . . ]
520}
521.Ed
522.Pp
523.Fn disk_isbusy
524is used to get the status of a disk device; it returns true if the device is
525currently busy and false if it is not.
526Like
527.Fn disk_wait ,
528.Fn disk_busy
529and
530.Fn disk_unbusy
531it requires explicit locking by the caller.
532.Sh CODE REFERENCES
533The disk framework itself is implemented within the file
534.Pa sys/kern/subr_disk.c .
535Data structures and function prototypes for the framework are located in
536.Pa sys/sys/disk.h .
537.Pp
538The
539.Nx
540machine-independent SCSI disk and CD-ROM drivers use the
541disk framework.
542They are located in
543.Pa sys/scsi/sd.c
544and
545.Pa sys/scsi/cd.c .
546.Pp
547The
548.Nx
549.Nm ccd ,
550.Nm dm ,
551and
552.Nm vnd
553drivers use the detachment capability of the framework.
554They are located in
555.Pa sys/dev/ccd.c ,
556.Pa sys/dev/vnd.c ,
557and
558.Pa sys/dev/dm/device-mapper.c .
559.Sh SEE ALSO
560.Xr ccd 4 ,
561.Xr dm 4 ,
562.Xr vnd 4 ,
563.Xr dksubr 9
564.Sh HISTORY
565The
566.Nx
567generic disk framework appeared in
568.Nx 1.2 .
569.Sh AUTHORS
570The
571.Nx
572generic disk framework was architected and implemented by
573.An Jason R. Thorpe
574.Aq Mt thorpej@NetBSD.org .
575