xref: /netbsd-src/share/man/man9/wapbl.9 (revision 88f86411180858c659b44a4139ca3d7d3963d6b3)
1.\"	$NetBSD: wapbl.9,v 1.15 2017/03/18 19:01:01 riastradh Exp $
2.\"
3.\" Copyright (c) 2015 The NetBSD Foundation, Inc.
4.\" All rights reserved.
5.\"
6.\" This code is derived from software contributed to The NetBSD Foundation
7.\" by Taylor R. Campbell.
8.\"
9.\" Redistribution and use in source and binary forms, with or without
10.\" modification, are permitted provided that the following conditions
11.\" are met:
12.\" 1. Redistributions of source code must retain the above copyright
13.\"    notice, this list of conditions and the following disclaimer.
14.\" 2. Redistributions in binary form must reproduce the above copyright
15.\"    notice, this list of conditions and the following disclaimer in the
16.\"    documentation and/or other materials provided with the distribution.
17.\"
18.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28.\" POSSIBILITY OF SUCH DAMAGE.
29.\"
30.Dd March 26, 2015
31.Dt WAPBL 9
32.Os
33.Sh NAME
34.Nm WAPBL ,
35.Nm wapbl_start ,
36.Nm wapbl_stop ,
37.Nm wapbl_begin ,
38.Nm wapbl_end ,
39.Nm wapbl_flush ,
40.Nm wapbl_discard ,
41.Nm wapbl_add_buf ,
42.Nm wapbl_remove_buf ,
43.Nm wapbl_resize_buf ,
44.Nm wapbl_register_inode ,
45.Nm wapbl_unregister_inode ,
46.Nm wapbl_register_deallocation ,
47.Nm wapbl_jlock_assert ,
48.Nm wapbl_junlock_assert
49.Nd write-ahead physical block logging for file systems
50.Sh SYNOPSIS
51.In sys/wapbl.h
52.Vt typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int) ;
53.Ft int
54.Fn wapbl_start "struct wapbl **wlp" "struct mount *mp" "struct vnode *devvp" \
55        "daddr_t off" "size_t count" "size_t blksize" \
56        "struct wapbl_replay *wr" \
57        "wapbl_flush_fn_t flushfn" "wapbl_flush_fn_t flushabortfn"
58.Ft int
59.Fn wapbl_stop "struct wapbl *wl" "int force"
60.Ft int
61.Fn wapbl_begin "struct wapbl *wl" "const char *file" "int line"
62.Ft void
63.Fn wapbl_end "struct wapbl *wl"
64.Ft int
65.Fn wapbl_flush "struct wapbl *wl" "int wait"
66.Ft void
67.Fn wapbl_discard "struct wapbl *wl"
68.Ft void
69.Fn wapbl_add_buf "struct wapbl *wl" "struct buf *bp"
70.Ft void
71.Fn wapbl_remove_buf "struct wapbl *wl" "struct buf *bp"
72.Ft void
73.Fn wapbl_resize_buf "struct wapbl *wl" "struct buf *bp" "long oldsz" \
74       "long oldcnt"
75.Ft void
76.Fn wapbl_register_inode "struct wapbl *wl" "ino_t ino" "mode_t mode"
77.Ft void
78.Fn wapbl_unregister_inode "struct wapbl *wl" "ino_t ino" "mode_t mode"
79.Ft void
80.Fn wapbl_register_deallocation "struct wapbl *wl" "daddr_t blk" "int len"
81.Ft void
82.Fn wapbl_jlock_assert "struct wapbl *wl"
83.Ft void
84.Fn wapbl_junlock_assert "struct wapbl *wl"
85.Sh DESCRIPTION
86.Nm ,
87or
88.Em write-ahead physical block logging ,
89is an abstraction for file systems to write physical blocks in the
90.Xr buffercache 9
91to a bounded-size log first before their real destinations on disk.
92The name means:
93.Bl -tag -width "physical block" -offset abcd
94.It logging
95batches of writes are issued atomically via a log
96.It physical block
97only physical blocks, not logical file system operations, are stored in
98the log
99.It write-ahead
100before writing a block to disk, its new content, rather than its old
101content for roll-back, is recorded in the log
102.El
103.Pp
104When a file system using
105.Nm
106issues writes (as in
107.Xr bwrite 9
108or
109.Xr bdwrite 9 ) ,
110they are grouped in batches called
111.Em transactions
112in memory, which are serialized to be consistent with program order
113before
114.Nm
115submits them to disk atomically.
116.Pp
117Thus, within a transaction, after one write, another write need not
118wait for disk I/O, and if the system is interrupted, e.g. by a crash or
119by power failure, either both writes will appear on disk, or neither
120will.
121.Pp
122When a transaction is full, it is written to a circular buffer on
123disk called the
124.Em log .
125When the transaction has been written to disk, every write in the
126transaction is submitted to disk asynchronously.
127Finally, the file system may issue new writes via
128.Nm
129once enough writes submitted to disk have completed.
130.Pp
131After interruption, such as a crash or power failure, some writes
132issued by the file system may not have completed.
133However, the log is written consistently with program order and before
134file system writes are submitted to disk.
135Hence a consistent program-order view of the file system can be
136attained by resubmitting the writes that were successfully stored in
137the log using
138.Xr wapbl_replay 9 .
139This may not be the same state as just before interruption \(em writes in
140transactions that did not reach the disk will be excluded.
141.Pp
142For a file system to use
143.Nm ,
144its
145.Xr VFS_MOUNT 9
146method should first replay any journal on disk using
147.Xr wapbl_replay 9 ,
148and then, if the mount is read/write, initialize
149.Nm
150for the mount by calling
151.Fn wapbl_start .
152The
153.Xr VFS_UNMOUNT 9
154method should call
155.Fn wapbl_stop .
156.Pp
157Before issuing any
158.Xr buffercache 9
159writes, the file system must acquire a shared lock on the current
160.Nm
161transaction with
162.Fn wapbl_begin ,
163which may sleep until there is room in the transaction for new writes.
164After issuing the writes, the file system must release its shared lock
165on the transaction with
166.Fn wapbl_end .
167Either all writes issued between
168.Fn wapbl_begin
169and
170.Fn wapbl_end
171will complete, or none of them will.
172.Pp
173File systems may also witness an
174.Em exclusive
175lock on the current transaction when
176.Nm
177is flushing the transaction to disk, or aborting a flush, and invokes a
178file system's callback.
179File systems can assert that the transaction is locked with
180.Fn wapbl_jlock_assert ,
181or not
182.Em exclusively
183locked, with
184.Fn wapbl_junlock_assert .
185.Pp
186If a file system requires multiple transactions to initialize an
187inode, and needs to destroy partially initialized inodes during replay,
188it can register them by
189.Vt ino_t
190inode number before initialization with
191.Fn wapbl_register_inode
192and unregister them with
193.Fn wapbl_unregister_inode
194once initialization is complete.
195.Nm
196does not actually concern itself with whether the objects identified by
197.Vt ino_t
198values are
199.Sq inodes
200or
201.Sq quaggas
202or anything else \(em file systems may use this to list any objects
203keyed by
204.Vt ino_t
205value in the log.
206.Pp
207When a file system frees resources on disk and issues writes to reflect
208the fact, it cannot then reuse the resources until the writes have
209reached the disk.
210However, as far as the
211.Xr buffercache 9
212is concerned, as soon as the file system issues the writes, they will
213appear to have been written.
214So the file system must not attempt to reuse the resource until the
215current
216.Nm
217transaction has been flushed to disk.
218.Pp
219The file system can defer freeing a resource by calling
220.Fn wapbl_register_deallocation
221to record the disk address of the resource and length in bytes of the
222resource.
223Then, when
224.Nm
225next flushes the transaction to disk, it will pass an array of the disk
226addresses and lengths in bytes to a file-system-supplied callback.
227(Again,
228.Nm
229does not care whether the
230.Sq disk address
231or
232.Sq length in bytes
233is actually that; it will pass along
234.Vt daddr_t
235and
236.Vt int
237values.)
238.Sh FUNCTIONS
239.Bl -tag -width abcd
240.It Fn wapbl_start wlp mp devvp off count blksize wr flushfn flushabortfn
241Start using
242.Nm
243for the file system mounted at
244.Fa mp ,
245storing a log of
246.Fa count
247disk sectors at disk address
248.Fa off
249on the block device
250.Fa devvp ,
251writing blocks in units of
252.Fa blksize
253bytes.
254On success, stores an opaque
255.Vt "struct wapbl *"
256cookie in
257.Li * Ns Fa wlp
258for use with the other
259.Nm
260routines and returns zero.
261On failure, returns an error number.
262.Pp
263If the file system had replayed the log with
264.Xr wapbl_replay 9 ,
265then
266.Fa wr
267must be the
268.Vt "struct wapbl_replay *"
269cookie used to replay it, and
270.Fn wapbl_start
271will register any inodes that were in the log as if with
272.Fn wapbl_register_inode ;
273otherwise
274.Fa wr
275must be
276.Dv NULL .
277.Pp
278.Fa flushfn
279is a callback that
280.Nm
281will invoke as
282.Fa flushfn ( Fa mp , Fa deallocblks , Fa dealloclens , Fa dealloccnt )
283just before it flushes a transaction to disk, with an exclusive
284lock held on the transaction, where
285.Fa mp
286is the mount point passed to
287.Fn wapbl_start ,
288.Fa deallocblks
289is an array of
290.Fa dealloccnt
291disk addresses, and
292.Fa dealloclens
293is an array of
294.Fa dealloccnt
295lengths, corresponding to the addresses and lengths the file system
296passed to
297.Fn wapbl_register_deallocation .
298If flushing the transaction to disk fails,
299.Nm
300will call
301.Fa flushabortfn
302with the same arguments to undo any effects that
303.Fa flushfn
304had.
305.It Fn wapbl_stop wl force
306Flush the current transaction to disk and stop using
307.Nm .
308If flushing the transaction fails and
309.Fa force
310is zero,
311return an error.
312If flushing the transaction fails and
313.Fa force
314is nonzero, discard the transaction, permanently losing any writes in
315it.
316If flushing the transaction is successful or if
317.Fa force
318is nonzero,
319free memory associated with
320.Fa wl
321and return zero.
322.It Fn wapbl_begin wl file line
323Wait for space in the current transaction for new writes, flushing it
324if necessary, and acquire a shared lock on it.
325.Pp
326The lock is not exclusive: other threads may acquire shared locks on
327the transaction too.
328The lock is not recursive: a thread may not acquire it again without
329calling
330.Fn wapbl_end
331first.
332.Pp
333May sleep.
334.Pp
335.Fa file
336and
337.Fa line
338are the file name and line number of the caller for debugging
339purposes.
340.It Fn wapbl_end wl
341Release a shared lock on the transaction acquired with
342.Fn wapbl_begin .
343.It Fn wapbl_flush wl wait
344Flush the current transaction to disk.
345If
346.Fa wait
347is nonzero, wait for all writes in the current transaction to
348complete.
349.Pp
350The current transaction must not be locked.
351.It Fn wapbl_discard wl
352Discard the current transaction, permanently losing any writes in it.
353.Pp
354The current transaction must not be locked.
355.It Fn wapbl_add_buf wl bp
356Add the buffer
357.Fa bp
358to the current transaction, which must be locked, because someone has
359asked to write it.
360.Pp
361This is meant to be called from within
362.Xr buffercache 9 ,
363not by file systems directly.
364.It Fn wapbl_remove_buf wl bp
365Remove the buffer
366.Fa bp ,
367which must have been added using
368.Fn wapbl_add_buf ,
369from the current transaction, which must be locked, because it has been
370invalidated (or XXX ???).
371.Pp
372This is meant to be called from within
373.Xr buffercache 9 ,
374not by file systems directly.
375.It Fn wapbl_resize_buf wl bp oldsz oldcnt
376Note that the buffer
377.Fa bp ,
378which must have been added using
379.Fn wapbl_add_buf ,
380has changed size, where
381.Fa oldsz
382is the previous allocated size in bytes and
383.Fa oldcnt
384is the previous number of valid bytes in
385.Fa bp .
386.Pp
387This is meant to be called from within
388.Xr buffercache 9 ,
389not by file systems directly.
390.It Fn wapbl_register_inode wl ino mode
391Register
392.Fa ino
393with the mode
394.Fa mode
395as commencing initialization.
396.It Fn wapbl_unregister_inode wl ino mode
397Unregister
398.Fa ino ,
399which must have previously been registered with
400.Fn wapbl_register_inode
401using the same
402.Fa mode ,
403now that its initialization has completed.
404.It Fn wapbl_register_deallocation wl blk len
405Register
406.Fa len
407bytes at the disk address
408.Fa blk
409as ready for deallocation, so that they will be passed to the
410.Fa flushfn
411that was given to
412.Fn wapbl_start .
413.It Fn wapbl_jlock_assert wl
414Assert that the current transaction is locked.
415.Pp
416Note that it might not be locked by the current thread: this assertion
417passes if
418.Em any
419thread has it locked.
420.It Fn wapbl_junlock_assert wl
421Assert that the current transaction is not exclusively locked by the
422current thread.
423.Pp
424Users of
425.Nm
426observe exclusive locks only in the
427.Fa flushfn
428and
429.Fa flushabortfn
430callbacks to
431.Fn wapbl_start .
432Outside of such contexts, the transaction is never exclusively locked,
433even between
434.Fn wapbl_begin
435and
436.Fn wapbl_end .
437.Pp
438There is no way to assert that the current transaction is not locked at
439all \(em i.e., that the caller may acquire a shared lock on the
440transaction with
441.Fn wapbl_begin
442without danger of deadlock.
443.El
444.Sh CODE REFERENCES
445The
446.Nm
447subsystem is implemented in
448.Pa sys/kern/vfs_wapbl.c ,
449with hooks in
450.Pa sys/kern/vfs_bio.c .
451.Sh SEE ALSO
452.Xr buffercache 9 ,
453.Xr vfsops 9 ,
454.Xr wapbl_replay 9
455.Sh BUGS
456.Nm
457works only for file system metadata managed via the
458.Xr buffercache 9 ,
459and provides no way to log writes via the page cache, as in
460.Xr VOP_GETPAGES 9 ,
461.Xr VOP_PUTPAGES 9 ,
462and
463.Xr ubc_uiomove 9 ,
464which is normally used for file data.
465.Pp
466Not only is
467.Nm
468unable to log writes via the page cache, it is also unable to defer
469.Xr buffercache 9
470writes until cached pages have been written.
471This manifests as the well-known garbage-data-appended-after-crash bug
472in FFS: when appending to a file, the pages containing new data may not
473reach the disk before the inode update reporting its new size.
474After a crash, the inode update will be on disk, but the new data will
475not be \(em instead, whatever garbage data was in the free space will
476appear to have been appended to the file.
477.Nm
478exacerbates the problem by increasing the throughput of metadata
479writes, because it can issue many metadata writes asynchronously that
480FFS without
481.Nm
482would need to issue synchronously in order for
483.Xr fsck 8
484to work.
485.Pp
486The criteria for when the transaction must be flushed to disk before
487.Fn wapbl_begin
488returns are heuristic, i.e. wrong.
489There is no way for a file system to communicate to
490.Fn wapbl_begin
491how many buffers, inodes, and deallocations it will issue via
492.Nm
493in the transaction.
494.Pp
495.Nm
496mainly supports write-ahead, and has only limited support for rolling
497back operations, in the form of
498.Fn wapbl_register_inode
499and
500.Fn wapbl_unregister_inode .
501Consequently, for example, large writes appending to a file, which
502require multiple disk block allocations and an inode update, must
503occur in a single transaction \(em there is no way to roll back the
504disk block allocations if the write fails in the middle, e.g. because
505of a fault in the middle of the user buffer.
506.Pp
507.Fn wapbl_jlock_assert
508does not guarantee that the current thread has the current transaction
509locked.
510.Fn wapbl_junlock_assert
511does not guarantee that the current thread does not have the current
512transaction locked at all.
513.Pp
514There is only one
515.Nm
516transaction for each file system at any given time, and only one
517.Nm
518log on disk.
519Consequently, all writes are serialized.
520Extending
521.Nm
522to support multiple logs per file system, partitioned according to an
523appropriate scheme, is left as an exercise for the reader.
524.Pp
525There is no reason for
526.Nm
527to require its own hooks in
528.Xr buffercache 9 .
529.Pp
530The on-disk format used by
531.Nm
532is undocumented.
533