xref: /onnv-gate/usr/src/uts/common/fs/zfs/zil.c (revision 789)
1*789Sahrens /*
2*789Sahrens  * CDDL HEADER START
3*789Sahrens  *
4*789Sahrens  * The contents of this file are subject to the terms of the
5*789Sahrens  * Common Development and Distribution License, Version 1.0 only
6*789Sahrens  * (the "License").  You may not use this file except in compliance
7*789Sahrens  * with the License.
8*789Sahrens  *
9*789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*789Sahrens  * or http://www.opensolaris.org/os/licensing.
11*789Sahrens  * See the License for the specific language governing permissions
12*789Sahrens  * and limitations under the License.
13*789Sahrens  *
14*789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*789Sahrens  *
20*789Sahrens  * CDDL HEADER END
21*789Sahrens  */
22*789Sahrens /*
23*789Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*789Sahrens  * Use is subject to license terms.
25*789Sahrens  */
26*789Sahrens 
27*789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*789Sahrens 
29*789Sahrens #include <sys/zfs_context.h>
30*789Sahrens #include <sys/spa.h>
31*789Sahrens #include <sys/dmu.h>
32*789Sahrens #include <sys/zap.h>
33*789Sahrens #include <sys/arc.h>
34*789Sahrens #include <sys/stat.h>
35*789Sahrens #include <sys/resource.h>
36*789Sahrens #include <sys/zil.h>
37*789Sahrens #include <sys/zil_impl.h>
38*789Sahrens #include <sys/dsl_dataset.h>
39*789Sahrens #include <sys/vdev.h>
40*789Sahrens 
41*789Sahrens 
42*789Sahrens /*
43*789Sahrens  * The zfs intent log (ZIL) saves, in memory, transaction records of
44*789Sahrens  * system calls that change the file system, with enough information
45*789Sahrens  * to replay them. These records are held in memory until either
46*789Sahrens  * the DMU transaction group (txg) commits them to the stable pool
47*789Sahrens  * and they can be discarded, or they are flushed to the stable log
48*789Sahrens  * (also in the pool) due to an fsync, O_DSYNC or other synchronous
49*789Sahrens  * requirement. In the event of a panic or power failure, those log
50*789Sahrens  * records (transactions) are replayed.
51*789Sahrens  *
52*789Sahrens  * There is one ZIL per file system. Its on-disk (pool) format consists
53*789Sahrens  * of 3 parts:
54*789Sahrens  *
55*789Sahrens  * 	- ZIL header
56*789Sahrens  * 	- ZIL blocks
57*789Sahrens  * 	- ZIL records
58*789Sahrens  *
59*789Sahrens  * A log record holds a system call transaction. Log blocks can
60*789Sahrens  * hold many log records and the blocks are chained together.
61*789Sahrens  * Each ZIL block contains a block pointer (blkptr_t) to the next
62*789Sahrens  * ZIL block in the chain. The ZIL header points to the first
63*789Sahrens  * block in the chain. Note there is not a fixed place in the pool
64*789Sahrens  * to hold blocks. They are dynamically allocated and freed as
65*789Sahrens  * needed from the blocks available. The layout is sketched below.
66*789Sahrens  */
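
/*
 * A rough sketch of the chain described above (illustrative only; the
 * authoritative definitions live in the zil headers, sys/zil.h and
 * sys/zil_impl.h):
 *
 *	zil_header_t          log block             log block
 *	+-----------+      +----------------+      +----------------+
 *	|  zh_log  -+----->| lr | ... | ztp-+----->| lr | ... | ztp-+--> ...
 *	+-----------+      +----------------+      +----------------+
 *
 * Each block's trailer (zil_trailer_t, "ztp" above) records how much of
 * the block is used by log records (zit_nused) and holds the blkptr of
 * the next block in the chain (zit_next_blk).
 */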
67*789Sahrens 
68*789Sahrens /*
69*789Sahrens  * These global ZIL switches affect all pools
70*789Sahrens  */
71*789Sahrens int zil_disable = 0;	/* disable intent logging */
72*789Sahrens int zil_always = 0;	/* make every transaction synchronous */
73*789Sahrens int zil_purge = 0;	/* at pool open, just throw everything away */
74*789Sahrens int zil_noflush = 0;	/* don't flush write cache buffers on disks */
75*789Sahrens 
76*789Sahrens static kmem_cache_t *zil_lwb_cache;
77*789Sahrens 
78*789Sahrens static int
79*789Sahrens zil_dva_compare(const void *x1, const void *x2)
80*789Sahrens {
81*789Sahrens 	const dva_t *dva1 = x1;
82*789Sahrens 	const dva_t *dva2 = x2;
83*789Sahrens 
84*789Sahrens 	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
85*789Sahrens 		return (-1);
86*789Sahrens 	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
87*789Sahrens 		return (1);
88*789Sahrens 
89*789Sahrens 	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
90*789Sahrens 		return (-1);
91*789Sahrens 	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
92*789Sahrens 		return (1);
93*789Sahrens 
94*789Sahrens 	return (0);
95*789Sahrens }
96*789Sahrens 
97*789Sahrens static void
98*789Sahrens zil_dva_tree_init(avl_tree_t *t)
99*789Sahrens {
100*789Sahrens 	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
101*789Sahrens 	    offsetof(zil_dva_node_t, zn_node));
102*789Sahrens }
103*789Sahrens 
104*789Sahrens static void
105*789Sahrens zil_dva_tree_fini(avl_tree_t *t)
106*789Sahrens {
107*789Sahrens 	zil_dva_node_t *zn;
108*789Sahrens 	void *cookie = NULL;
109*789Sahrens 
110*789Sahrens 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
111*789Sahrens 		kmem_free(zn, sizeof (zil_dva_node_t));
112*789Sahrens 
113*789Sahrens 	avl_destroy(t);
114*789Sahrens }
115*789Sahrens 
116*789Sahrens static int
117*789Sahrens zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
118*789Sahrens {
119*789Sahrens 	zil_dva_node_t *zn;
120*789Sahrens 	avl_index_t where;
121*789Sahrens 
122*789Sahrens 	if (avl_find(t, dva, &where) != NULL)
123*789Sahrens 		return (EEXIST);
124*789Sahrens 
125*789Sahrens 	zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
126*789Sahrens 	zn->zn_dva = *dva;
127*789Sahrens 	avl_insert(t, zn, where);
128*789Sahrens 
129*789Sahrens 	return (0);
130*789Sahrens }
131*789Sahrens 
132*789Sahrens /*
133*789Sahrens  * Read a log block, make sure it's valid, and byteswap it if necessary.
134*789Sahrens  */
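/*
 * A block passes the checks below only if the read succeeds, the trailer's
 * next-block verifier equals this block's checksum with the sequence word
 * bumped, the next-block pointer is not a hole, and zit_nused fits within
 * the block.  zil_parse() stops walking the chain at the first failure.
 */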
135*789Sahrens static int
136*789Sahrens zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
137*789Sahrens {
138*789Sahrens 	uint64_t blksz = BP_GET_LSIZE(bp);
139*789Sahrens 	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
140*789Sahrens 	zio_cksum_t cksum;
141*789Sahrens 	int error;
142*789Sahrens 
143*789Sahrens 	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
144*789Sahrens 	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
145*789Sahrens 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
146*789Sahrens 	if (error) {
147*789Sahrens 		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
148*789Sahrens 		    zilog, bp, error);
149*789Sahrens 		return (error);
150*789Sahrens 	}
151*789Sahrens 
152*789Sahrens 	if (BP_SHOULD_BYTESWAP(bp))
153*789Sahrens 		byteswap_uint64_array(buf, blksz);
154*789Sahrens 
155*789Sahrens 	/*
156*789Sahrens 	 * Sequence numbers should be... sequential.  The checksum verifier for
157*789Sahrens 	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
158*789Sahrens 	 */
159*789Sahrens 	cksum = bp->blk_cksum;
160*789Sahrens 	cksum.zc_word[3]++;
161*789Sahrens 	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
162*789Sahrens 		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
163*789Sahrens 		return (ESTALE);
164*789Sahrens 	}
165*789Sahrens 
166*789Sahrens 	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
167*789Sahrens 		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
168*789Sahrens 		return (ENOENT);
169*789Sahrens 	}
170*789Sahrens 
171*789Sahrens 	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
172*789Sahrens 		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
173*789Sahrens 		return (EOVERFLOW);
174*789Sahrens 	}
175*789Sahrens 
176*789Sahrens 	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);
177*789Sahrens 
178*789Sahrens 	return (0);
179*789Sahrens }
180*789Sahrens 
181*789Sahrens /*
182*789Sahrens  * Parse the intent log, and call parse_func for each valid record within.
183*789Sahrens  */
184*789Sahrens void
185*789Sahrens zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
186*789Sahrens     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
187*789Sahrens {
188*789Sahrens 	blkptr_t blk;
189*789Sahrens 	char *lrbuf, *lrp;
190*789Sahrens 	zil_trailer_t *ztp;
191*789Sahrens 	int reclen, error;
192*789Sahrens 
193*789Sahrens 	blk = zilog->zl_header->zh_log;
194*789Sahrens 	if (BP_IS_HOLE(&blk))
195*789Sahrens 		return;
196*789Sahrens 
197*789Sahrens 	/*
198*789Sahrens 	 * Starting at the block pointed to by zh_log we read the log chain.
199*789Sahrens 	 * For each block in the chain we strongly check that block to
200*789Sahrens 	 * ensure its validity.  We stop when an invalid block is found.
201*789Sahrens 	 * For each block pointer in the chain we call parse_blk_func().
202*789Sahrens 	 * For each record in each valid block we call parse_lr_func().
203*789Sahrens 	 */
204*789Sahrens 	zil_dva_tree_init(&zilog->zl_dva_tree);
205*789Sahrens 	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
206*789Sahrens 	for (;;) {
207*789Sahrens 		error = zil_read_log_block(zilog, &blk, lrbuf);
208*789Sahrens 
209*789Sahrens 		if (parse_blk_func != NULL)
210*789Sahrens 			parse_blk_func(zilog, &blk, arg, txg);
211*789Sahrens 
212*789Sahrens 		if (error)
213*789Sahrens 			break;
214*789Sahrens 
215*789Sahrens 		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
216*789Sahrens 		blk = ztp->zit_next_blk;
217*789Sahrens 
218*789Sahrens 		if (parse_lr_func == NULL)
219*789Sahrens 			continue;
220*789Sahrens 
221*789Sahrens 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
222*789Sahrens 			lr_t *lr = (lr_t *)lrp;
223*789Sahrens 			reclen = lr->lrc_reclen;
224*789Sahrens 			ASSERT3U(reclen, >=, sizeof (lr_t));
225*789Sahrens 			parse_lr_func(zilog, lr, arg, txg);
226*789Sahrens 		}
227*789Sahrens 	}
228*789Sahrens 	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
229*789Sahrens 	zil_dva_tree_fini(&zilog->zl_dva_tree);
230*789Sahrens }
231*789Sahrens 
232*789Sahrens /* ARGSUSED */
233*789Sahrens static void
234*789Sahrens zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
235*789Sahrens {
236*789Sahrens 	spa_t *spa = zilog->zl_spa;
237*789Sahrens 	int err;
238*789Sahrens 
239*789Sahrens 	dprintf_bp(bp, "first_txg %llu: ", first_txg);
240*789Sahrens 
241*789Sahrens 	/*
242*789Sahrens 	 * Claim log block if not already committed and not already claimed.
243*789Sahrens 	 */
244*789Sahrens 	if (bp->blk_birth >= first_txg &&
245*789Sahrens 	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
246*789Sahrens 		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
247*789Sahrens 		ASSERT(err == 0);
248*789Sahrens 	}
249*789Sahrens }
250*789Sahrens 
251*789Sahrens static void
252*789Sahrens zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
253*789Sahrens {
254*789Sahrens 	if (lrc->lrc_txtype == TX_WRITE) {
255*789Sahrens 		lr_write_t *lr = (lr_write_t *)lrc;
256*789Sahrens 		zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
257*789Sahrens 	}
258*789Sahrens }
259*789Sahrens 
260*789Sahrens /* ARGSUSED */
261*789Sahrens static void
262*789Sahrens zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
263*789Sahrens {
264*789Sahrens 	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
265*789Sahrens }
266*789Sahrens 
267*789Sahrens static void
268*789Sahrens zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
269*789Sahrens {
270*789Sahrens 	/*
271*789Sahrens 	 * If we previously claimed it, we need to free it.
272*789Sahrens 	 */
273*789Sahrens 	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
274*789Sahrens 		lr_write_t *lr = (lr_write_t *)lrc;
275*789Sahrens 		blkptr_t *bp = &lr->lr_blkptr;
276*789Sahrens 		if (bp->blk_birth >= claim_txg &&
277*789Sahrens 		    !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
278*789Sahrens 			(void) arc_free(NULL, zilog->zl_spa,
279*789Sahrens 			    dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
280*789Sahrens 		}
281*789Sahrens 	}
282*789Sahrens }
283*789Sahrens 
284*789Sahrens /*
285*789Sahrens  * Create an on-disk intent log.
286*789Sahrens  */
287*789Sahrens static void
288*789Sahrens zil_create(zilog_t *zilog)
289*789Sahrens {
290*789Sahrens 	lwb_t *lwb;
291*789Sahrens 	uint64_t txg;
292*789Sahrens 	dmu_tx_t *tx;
293*789Sahrens 	blkptr_t blk;
294*789Sahrens 	int error;
295*789Sahrens 
296*789Sahrens 	ASSERT(zilog->zl_header->zh_claim_txg == 0);
297*789Sahrens 	ASSERT(zilog->zl_header->zh_replay_seq == 0);
298*789Sahrens 
299*789Sahrens 	/*
300*789Sahrens 	 * Initialize the log header block.
301*789Sahrens 	 */
302*789Sahrens 	tx = dmu_tx_create(zilog->zl_os);
303*789Sahrens 	(void) dmu_tx_assign(tx, TXG_WAIT);
304*789Sahrens 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
305*789Sahrens 	txg = dmu_tx_get_txg(tx);
306*789Sahrens 
307*789Sahrens 	/*
308*789Sahrens 	 * Allocate the first log block and assign its checksum verifier.
309*789Sahrens 	 */
310*789Sahrens 	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
311*789Sahrens 	    ZIL_MIN_BLKSZ, &blk, txg);
312*789Sahrens 	if (error == 0) {
313*789Sahrens 		ZIO_SET_CHECKSUM(&blk.blk_cksum,
314*789Sahrens 		    spa_get_random(-1ULL), spa_get_random(-1ULL),
315*789Sahrens 		    dmu_objset_id(zilog->zl_os), 1ULL);
316*789Sahrens 
317*789Sahrens 		/*
318*789Sahrens 		 * Allocate a log write buffer (lwb) for the first log block.
319*789Sahrens 		 */
320*789Sahrens 		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
321*789Sahrens 		lwb->lwb_zilog = zilog;
322*789Sahrens 		lwb->lwb_blk = blk;
323*789Sahrens 		lwb->lwb_nused = 0;
324*789Sahrens 		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
325*789Sahrens 		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
326*789Sahrens 		lwb->lwb_max_txg = txg;
327*789Sahrens 		lwb->lwb_seq = 0;
328*789Sahrens 		lwb->lwb_state = UNWRITTEN;
329*789Sahrens 		mutex_enter(&zilog->zl_lock);
330*789Sahrens 		list_insert_tail(&zilog->zl_lwb_list, lwb);
331*789Sahrens 		mutex_exit(&zilog->zl_lock);
332*789Sahrens 	}
333*789Sahrens 
334*789Sahrens 	dmu_tx_commit(tx);
335*789Sahrens 	txg_wait_synced(zilog->zl_dmu_pool, txg);
336*789Sahrens }
337*789Sahrens 
338*789Sahrens /*
339*789Sahrens  * In one tx, free all log blocks and clear the log header.
340*789Sahrens  */
341*789Sahrens void
342*789Sahrens zil_destroy(zilog_t *zilog)
343*789Sahrens {
344*789Sahrens 	dmu_tx_t *tx;
345*789Sahrens 	uint64_t txg;
346*789Sahrens 
347*789Sahrens 	mutex_enter(&zilog->zl_destroy_lock);
348*789Sahrens 
349*789Sahrens 	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
350*789Sahrens 		mutex_exit(&zilog->zl_destroy_lock);
351*789Sahrens 		return;
352*789Sahrens 	}
353*789Sahrens 
354*789Sahrens 	tx = dmu_tx_create(zilog->zl_os);
355*789Sahrens 	(void) dmu_tx_assign(tx, TXG_WAIT);
356*789Sahrens 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
357*789Sahrens 	txg = dmu_tx_get_txg(tx);
358*789Sahrens 
359*789Sahrens 	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
360*789Sahrens 	    zilog->zl_header->zh_claim_txg);
361*789Sahrens 	zilog->zl_destroy_txg = txg;
362*789Sahrens 
363*789Sahrens 	dmu_tx_commit(tx);
364*789Sahrens 	txg_wait_synced(zilog->zl_dmu_pool, txg);
365*789Sahrens 
366*789Sahrens 	mutex_exit(&zilog->zl_destroy_lock);
367*789Sahrens }
368*789Sahrens 
369*789Sahrens void
370*789Sahrens zil_claim(char *osname, void *txarg)
371*789Sahrens {
372*789Sahrens 	dmu_tx_t *tx = txarg;
373*789Sahrens 	uint64_t first_txg = dmu_tx_get_txg(tx);
374*789Sahrens 	zilog_t *zilog;
375*789Sahrens 	zil_header_t *zh;
376*789Sahrens 	objset_t *os;
377*789Sahrens 	int error;
378*789Sahrens 
379*789Sahrens 	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
380*789Sahrens 	if (error) {
381*789Sahrens 		cmn_err(CE_WARN, "can't process intent log for %s", osname);
382*789Sahrens 		return;
383*789Sahrens 	}
384*789Sahrens 
385*789Sahrens 	zilog = dmu_objset_zil(os);
386*789Sahrens 	zh = zilog->zl_header;
387*789Sahrens 
388*789Sahrens 	/*
389*789Sahrens 	 * Claim all log blocks if we haven't already done so.
390*789Sahrens 	 */
391*789Sahrens 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
392*789Sahrens 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
393*789Sahrens 		zh->zh_claim_txg = first_txg;
394*789Sahrens 		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
395*789Sahrens 		    tx, first_txg);
396*789Sahrens 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
397*789Sahrens 	}
398*789Sahrens 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
399*789Sahrens 	dmu_objset_close(os);
400*789Sahrens }
401*789Sahrens 
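/*
 * Record (vdev, seq) so that zil_flush_vdevs() can later flush this vdev's
 * write cache for log writes up to seq.
 */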
402*789Sahrens void
403*789Sahrens zil_add_vdev(zilog_t *zilog, uint64_t vdev, uint64_t seq)
404*789Sahrens {
405*789Sahrens 	zil_vdev_t *zv;
406*789Sahrens 
407*789Sahrens 	if (zil_noflush)
408*789Sahrens 		return;
409*789Sahrens 
410*789Sahrens 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
411*789Sahrens 	zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
412*789Sahrens 	zv->vdev = vdev;
413*789Sahrens 	zv->seq = seq;
414*789Sahrens 	list_insert_tail(&zilog->zl_vdev_list, zv);
415*789Sahrens }
416*789Sahrens 
417*789Sahrens 
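/*
 * Flush the write caches of every vdev recorded (via zil_add_vdev()) for
 * log writes up to and including the given sequence number, so the blocks
 * we are about to declare stable really are on stable storage.
 */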
418*789Sahrens void
419*789Sahrens zil_flush_vdevs(zilog_t *zilog, uint64_t seq)
420*789Sahrens {
421*789Sahrens 	vdev_t *vd;
422*789Sahrens 	zil_vdev_t *zv, *zv2;
423*789Sahrens 	zio_t *zio;
424*789Sahrens 	spa_t *spa;
425*789Sahrens 	uint64_t vdev;
426*789Sahrens 
427*789Sahrens 	if (zil_noflush)
428*789Sahrens 		return;
429*789Sahrens 
430*789Sahrens 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
431*789Sahrens 
432*789Sahrens 	spa = zilog->zl_spa;
433*789Sahrens 	zio = NULL;
434*789Sahrens 
435*789Sahrens 	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL &&
436*789Sahrens 	    zv->seq <= seq) {
437*789Sahrens 		vdev = zv->vdev;
438*789Sahrens 		list_remove(&zilog->zl_vdev_list, zv);
439*789Sahrens 		kmem_free(zv, sizeof (zil_vdev_t));
440*789Sahrens 
441*789Sahrens 		/*
442*789Sahrens 		 * remove all chained entries <= seq with same vdev
443*789Sahrens 		 */
444*789Sahrens 		zv = list_head(&zilog->zl_vdev_list);
445*789Sahrens 		while (zv && zv->seq <= seq) {
446*789Sahrens 			zv2 = list_next(&zilog->zl_vdev_list, zv);
447*789Sahrens 			if (zv->vdev == vdev) {
448*789Sahrens 				list_remove(&zilog->zl_vdev_list, zv);
449*789Sahrens 				kmem_free(zv, sizeof (zil_vdev_t));
450*789Sahrens 			}
451*789Sahrens 			zv = zv2;
452*789Sahrens 		}
453*789Sahrens 
454*789Sahrens 		/* flush the write cache for this vdev */
455*789Sahrens 		mutex_exit(&zilog->zl_lock);
456*789Sahrens 		if (zio == NULL)
457*789Sahrens 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
458*789Sahrens 		vd = vdev_lookup_top(spa, vdev);
459*789Sahrens 		ASSERT(vd);
460*789Sahrens 		(void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
461*789Sahrens 		    NULL, NULL, ZIO_PRIORITY_NOW,
462*789Sahrens 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
463*789Sahrens 		mutex_enter(&zilog->zl_lock);
464*789Sahrens 	}
465*789Sahrens 
466*789Sahrens 	/*
467*789Sahrens 	 * Wait for all the flushes to complete.  Not all devices actually
468*789Sahrens 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
469*789Sahrens 	 */
470*789Sahrens 	if (zio != NULL)
471*789Sahrens 		(void) zio_wait(zio);
472*789Sahrens }
473*789Sahrens 
474*789Sahrens /*
475*789Sahrens  * Function called when a log block write completes
476*789Sahrens  */
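/*
 * Writes can complete out of order.  zl_ss_seq (the highest sequence
 * number known to be on stable storage) only advances once this buffer
 * and every earlier buffer in the chain have completed; later buffers
 * that finish first are parked in SEQ_INCOMPLETE until then.
 */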
477*789Sahrens static void
478*789Sahrens zil_lwb_write_done(zio_t *zio)
479*789Sahrens {
480*789Sahrens 	lwb_t *prev;
481*789Sahrens 	lwb_t *lwb = zio->io_private;
482*789Sahrens 	zilog_t *zilog = lwb->lwb_zilog;
483*789Sahrens 	uint64_t max_seq;
484*789Sahrens 
485*789Sahrens 	/*
486*789Sahrens 	 * Now that we've written this log block, we have a stable pointer
487*789Sahrens 	 * to the next block in the chain, so it's OK to let the txg in
488*789Sahrens 	 * which we allocated the next block sync.
489*789Sahrens 	 */
490*789Sahrens 	txg_rele_to_sync(&lwb->lwb_txgh);
491*789Sahrens 
492*789Sahrens 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
493*789Sahrens 	mutex_enter(&zilog->zl_lock);
494*789Sahrens 	lwb->lwb_buf = NULL;
495*789Sahrens 	if (zio->io_error) {
496*789Sahrens 		zilog->zl_log_error = B_TRUE;
497*789Sahrens 		mutex_exit(&zilog->zl_lock);
498*789Sahrens 		cv_broadcast(&zilog->zl_cv_seq);
499*789Sahrens 		return;
500*789Sahrens 	}
501*789Sahrens 
502*789Sahrens 	prev = list_prev(&zilog->zl_lwb_list, lwb);
503*789Sahrens 	if (prev && prev->lwb_state != SEQ_COMPLETE) {
504*789Sahrens 		/* There's an unwritten buffer in the chain before this one */
505*789Sahrens 		lwb->lwb_state = SEQ_INCOMPLETE;
506*789Sahrens 		mutex_exit(&zilog->zl_lock);
507*789Sahrens 		return;
508*789Sahrens 	}
509*789Sahrens 
510*789Sahrens 	max_seq = lwb->lwb_seq;
511*789Sahrens 	lwb->lwb_state = SEQ_COMPLETE;
512*789Sahrens 	/*
513*789Sahrens 	 * We must also follow up the chain for already written buffers
514*789Sahrens 	 * to see if we can set zl_ss_seq even higher.
515*789Sahrens 	 */
516*789Sahrens 	while ((lwb = list_next(&zilog->zl_lwb_list, lwb)) != NULL) {
517*789Sahrens 		if (lwb->lwb_state != SEQ_INCOMPLETE)
518*789Sahrens 			break;
519*789Sahrens 		lwb->lwb_state = SEQ_COMPLETE;
520*789Sahrens 		/* lwb_seq will be zero if we've written an empty buffer */
521*789Sahrens 		if (lwb->lwb_seq) {
522*789Sahrens 			ASSERT3U(max_seq, <, lwb->lwb_seq);
523*789Sahrens 			max_seq = lwb->lwb_seq;
524*789Sahrens 		}
525*789Sahrens 	}
526*789Sahrens 	zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
527*789Sahrens 	mutex_exit(&zilog->zl_lock);
528*789Sahrens 	cv_broadcast(&zilog->zl_cv_seq);
529*789Sahrens }
530*789Sahrens 
531*789Sahrens /*
532*789Sahrens  * Start a log block write and advance to the next log block.
533*789Sahrens  * Calls are serialized.
534*789Sahrens  */
535*789Sahrens static lwb_t *
536*789Sahrens zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
537*789Sahrens {
538*789Sahrens 	lwb_t *nlwb;
539*789Sahrens 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
540*789Sahrens 	uint64_t txg;
541*789Sahrens 	uint64_t zil_blksz;
542*789Sahrens 	int error;
543*789Sahrens 
544*789Sahrens 	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
545*789Sahrens 
546*789Sahrens 	/*
547*789Sahrens 	 * Allocate the next block and save its address in this block
548*789Sahrens 	 * before writing it in order to establish the log chain.
549*789Sahrens 	 * Note that if the allocation of nlwb synced before we wrote
550*789Sahrens 	 * the block that points at it (lwb), we'd leak it if we crashed.
551*789Sahrens 	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
552*789Sahrens 	 */
553*789Sahrens 	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
554*789Sahrens 	txg_rele_to_quiesce(&lwb->lwb_txgh);
555*789Sahrens 
556*789Sahrens 	/*
557*789Sahrens 	 * Pick a ZIL blocksize based upon the size of the outstanding
558*789Sahrens 	 * in-memory transactions, or, if there are none, the same size
559*789Sahrens 	 * as the last block.
560*789Sahrens 	 */
561*789Sahrens 	if (zilog->zl_itx_list_sz) {
562*789Sahrens 		zil_blksz = zilog->zl_itx_list_sz + sizeof (*ztp);
563*789Sahrens 		zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ);
564*789Sahrens 		if (zil_blksz > ZIL_MAX_BLKSZ)
565*789Sahrens 			zil_blksz = ZIL_MAX_BLKSZ;
566*789Sahrens 		zilog->zl_prev_blk_sz = zil_blksz;
567*789Sahrens 	} else {
568*789Sahrens 		zil_blksz = zilog->zl_prev_blk_sz;
569*789Sahrens 	}
570*789Sahrens 
571*789Sahrens 	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
572*789Sahrens 	    zil_blksz, &ztp->zit_next_blk, txg);
573*789Sahrens 	if (error) {
574*789Sahrens 		txg_rele_to_sync(&lwb->lwb_txgh);
575*789Sahrens 		return (NULL);
576*789Sahrens 	}
577*789Sahrens 
578*789Sahrens 	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
579*789Sahrens 	ztp->zit_nused = lwb->lwb_nused;
580*789Sahrens 	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
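	/*
	 * Chain the verifiers: the next block's checksum verifier is this
	 * block's with the sequence word bumped, which is what
	 * zil_read_log_block() expects when it walks the chain.
	 */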
581*789Sahrens 	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
582*789Sahrens 	ztp->zit_next_blk.blk_cksum.zc_word[3]++;
583*789Sahrens 
584*789Sahrens 	/*
585*789Sahrens 	 * Allocate a new log write buffer (lwb).
586*789Sahrens 	 */
587*789Sahrens 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
588*789Sahrens 
589*789Sahrens 	nlwb->lwb_zilog = zilog;
590*789Sahrens 	nlwb->lwb_blk = ztp->zit_next_blk;
591*789Sahrens 	nlwb->lwb_nused = 0;
592*789Sahrens 	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
593*789Sahrens 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
594*789Sahrens 	nlwb->lwb_max_txg = txg;
595*789Sahrens 	nlwb->lwb_seq = 0;
596*789Sahrens 	nlwb->lwb_state = UNWRITTEN;
597*789Sahrens 
598*789Sahrens 	/*
599*789Sahrens 	 * Put new lwb at the end of the log chain,
600*789Sahrens 	 * and record the vdev for later flushing
601*789Sahrens 	 */
602*789Sahrens 	mutex_enter(&zilog->zl_lock);
603*789Sahrens 	list_insert_tail(&zilog->zl_lwb_list, nlwb);
604*789Sahrens 	zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))),
605*789Sahrens 	    lwb->lwb_seq);
606*789Sahrens 	mutex_exit(&zilog->zl_lock);
607*789Sahrens 
608*789Sahrens 	/*
609*789Sahrens 	 * write the old log block
610*789Sahrens 	 */
611*789Sahrens 	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
612*789Sahrens 	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
613*789Sahrens 	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
614*789Sahrens 	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));
615*789Sahrens 
616*789Sahrens 	return (nlwb);
617*789Sahrens }
618*789Sahrens 
619*789Sahrens static lwb_t *
620*789Sahrens zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
621*789Sahrens {
622*789Sahrens 	lr_t *lrc = &itx->itx_lr; /* common log record */
623*789Sahrens 	uint64_t seq = lrc->lrc_seq;
624*789Sahrens 	uint64_t txg = lrc->lrc_txg;
625*789Sahrens 	uint64_t reclen = lrc->lrc_reclen;
626*789Sahrens 	int error;
627*789Sahrens 
628*789Sahrens 	if (lwb == NULL)
629*789Sahrens 		return (NULL);
630*789Sahrens 	ASSERT(lwb->lwb_buf != NULL);
631*789Sahrens 
632*789Sahrens 	/*
633*789Sahrens 	 * If it's a write, fetch the data or get its blkptr as appropriate.
634*789Sahrens 	 */
635*789Sahrens 	if (lrc->lrc_txtype == TX_WRITE) {
636*789Sahrens 		lr_write_t *lr = (lr_write_t *)lrc;
637*789Sahrens 		if (txg > spa_freeze_txg(zilog->zl_spa))
638*789Sahrens 			txg_wait_synced(zilog->zl_dmu_pool, txg);
639*789Sahrens 
640*789Sahrens 		if (!itx->itx_data_copied &&
641*789Sahrens 		    (error = zilog->zl_get_data(itx->itx_private, lr)) != 0) {
642*789Sahrens 			if (error != ENOENT && error != EALREADY) {
643*789Sahrens 				txg_wait_synced(zilog->zl_dmu_pool, txg);
644*789Sahrens 				mutex_enter(&zilog->zl_lock);
645*789Sahrens 				zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
646*789Sahrens 				zil_add_vdev(zilog,
647*789Sahrens 				    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))),
648*789Sahrens 				    seq);
649*789Sahrens 				mutex_exit(&zilog->zl_lock);
650*789Sahrens 				return (lwb);
651*789Sahrens 			}
652*789Sahrens 			mutex_enter(&zilog->zl_lock);
653*789Sahrens 			zil_add_vdev(zilog,
654*789Sahrens 			    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), seq);
655*789Sahrens 			mutex_exit(&zilog->zl_lock);
656*789Sahrens 			return (lwb);
657*789Sahrens 		}
658*789Sahrens 	}
659*789Sahrens 
660*789Sahrens 	/*
661*789Sahrens 	 * If this record won't fit in the current log block, start a new one.
662*789Sahrens 	 */
663*789Sahrens 	if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
664*789Sahrens 		lwb = zil_lwb_write_start(zilog, lwb);
665*789Sahrens 		if (lwb == NULL)
666*789Sahrens 			return (NULL);
667*789Sahrens 		if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
668*789Sahrens 			txg_wait_synced(zilog->zl_dmu_pool, txg);
669*789Sahrens 			mutex_enter(&zilog->zl_lock);
670*789Sahrens 			zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
671*789Sahrens 			mutex_exit(&zilog->zl_lock);
672*789Sahrens 			return (lwb);
673*789Sahrens 		}
674*789Sahrens 	}
675*789Sahrens 
676*789Sahrens 	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
677*789Sahrens 	lwb->lwb_nused += reclen;
678*789Sahrens 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
679*789Sahrens 	ASSERT3U(lwb->lwb_seq, <, seq);
680*789Sahrens 	lwb->lwb_seq = seq;
681*789Sahrens 	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
682*789Sahrens 	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
683*789Sahrens 
684*789Sahrens 	return (lwb);
685*789Sahrens }
686*789Sahrens 
687*789Sahrens itx_t *
688*789Sahrens zil_itx_create(int txtype, size_t lrsize)
689*789Sahrens {
690*789Sahrens 	itx_t *itx;
691*789Sahrens 
692*789Sahrens 	lrsize = P2ROUNDUP(lrsize, sizeof (uint64_t));
693*789Sahrens 
694*789Sahrens 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
695*789Sahrens 	itx->itx_lr.lrc_txtype = txtype;
696*789Sahrens 	itx->itx_lr.lrc_reclen = lrsize;
697*789Sahrens 	itx->itx_lr.lrc_seq = 0;	/* defensive */
698*789Sahrens 
699*789Sahrens 	return (itx);
700*789Sahrens }
701*789Sahrens 
702*789Sahrens uint64_t
703*789Sahrens zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
704*789Sahrens {
705*789Sahrens 	uint64_t seq;
706*789Sahrens 
707*789Sahrens 	ASSERT(itx->itx_lr.lrc_seq == 0);
708*789Sahrens 
709*789Sahrens 	mutex_enter(&zilog->zl_lock);
710*789Sahrens 	list_insert_tail(&zilog->zl_itx_list, itx);
711*789Sahrens 	zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
712*789Sahrens 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
713*789Sahrens 	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
714*789Sahrens 	mutex_exit(&zilog->zl_lock);
715*789Sahrens 
716*789Sahrens 	return (seq);
717*789Sahrens }
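
/*
 * Typical caller pattern (an illustrative sketch only, not code from this
 * file): the itx is built and assigned while the caller's dmu tx is open,
 * and zil_commit() is called later, e.g. on fsync:
 *
 *	itx = zil_itx_create(txtype, lrsize);
 *	(fill in the type-specific fields of itx->itx_lr)
 *	seq = zil_itx_assign(zilog, itx, tx);
 *	dmu_tx_commit(tx);
 *	...
 *	zil_commit(zilog, seq, ioflag);
 */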
718*789Sahrens 
719*789Sahrens /*
720*789Sahrens  * Free up all in-memory intent log transactions that have now been synced.
721*789Sahrens  */
722*789Sahrens static void
723*789Sahrens zil_itx_clean(zilog_t *zilog)
724*789Sahrens {
725*789Sahrens 	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
726*789Sahrens 	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
727*789Sahrens 	uint64_t max_seq = 0;
728*789Sahrens 	itx_t *itx;
729*789Sahrens 
730*789Sahrens 	mutex_enter(&zilog->zl_lock);
731*789Sahrens 	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
732*789Sahrens 	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
733*789Sahrens 		list_remove(&zilog->zl_itx_list, itx);
734*789Sahrens 		zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
735*789Sahrens 		ASSERT3U(max_seq, <, itx->itx_lr.lrc_seq);
736*789Sahrens 		max_seq = itx->itx_lr.lrc_seq;
737*789Sahrens 		kmem_free(itx, offsetof(itx_t, itx_lr)
738*789Sahrens 		    + itx->itx_lr.lrc_reclen);
739*789Sahrens 	}
740*789Sahrens 	if (max_seq > zilog->zl_ss_seq) {
741*789Sahrens 		zilog->zl_ss_seq = max_seq;
742*789Sahrens 		cv_broadcast(&zilog->zl_cv_seq);
743*789Sahrens 	}
744*789Sahrens 	mutex_exit(&zilog->zl_lock);
745*789Sahrens }
746*789Sahrens 
747*789Sahrens void
748*789Sahrens zil_clean(zilog_t *zilog)
749*789Sahrens {
750*789Sahrens 	/*
751*789Sahrens 	 * Check for any log blocks that can be freed.
752*789Sahrens 	 * Log blocks are only freed when the log block allocation and
753*789Sahrens 	 * log records contained within are both known to be committed.
754*789Sahrens 	 */
755*789Sahrens 	mutex_enter(&zilog->zl_lock);
756*789Sahrens 	if (list_head(&zilog->zl_itx_list) != NULL)
757*789Sahrens 		(void) taskq_dispatch(zilog->zl_clean_taskq,
758*789Sahrens 		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
759*789Sahrens 	mutex_exit(&zilog->zl_lock);
760*789Sahrens }
761*789Sahrens 
762*789Sahrens /*
763*789Sahrens  * Push zfs transactions to stable storage up to the supplied sequence number.
764*789Sahrens  */
765*789Sahrens void
766*789Sahrens zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)
767*789Sahrens {
768*789Sahrens 	uint64_t txg;
769*789Sahrens 	uint64_t max_seq;
770*789Sahrens 	uint64_t reclen;
771*789Sahrens 	itx_t *itx;
772*789Sahrens 	lwb_t *lwb;
773*789Sahrens 	spa_t *spa;
774*789Sahrens 
775*789Sahrens 	if (zilog == NULL || seq == 0 ||
776*789Sahrens 	    ((ioflag & (FSYNC | FDSYNC | FRSYNC)) == 0 && !zil_always))
777*789Sahrens 		return;
778*789Sahrens 
779*789Sahrens 	spa = zilog->zl_spa;
780*789Sahrens 	mutex_enter(&zilog->zl_lock);
781*789Sahrens 
782*789Sahrens 	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */
783*789Sahrens 
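	/*
	 * Only one thread may fill and write log blocks at a time
	 * (zl_writer).  Wait until either our sequence number is already
	 * stable or we become the writer.
	 */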
784*789Sahrens 	for (;;) {
785*789Sahrens 		if (zilog->zl_ss_seq >= seq) {	/* already on stable storage */
786*789Sahrens 			cv_signal(&zilog->zl_cv_write);
787*789Sahrens 			mutex_exit(&zilog->zl_lock);
788*789Sahrens 			return;
789*789Sahrens 		}
790*789Sahrens 
791*789Sahrens 		if (zilog->zl_writer == B_FALSE) /* no one writing, do it */
792*789Sahrens 			break;
793*789Sahrens 
794*789Sahrens 		cv_wait(&zilog->zl_cv_write, &zilog->zl_lock);
795*789Sahrens 	}
796*789Sahrens 
797*789Sahrens 	zilog->zl_writer = B_TRUE;
798*789Sahrens 	max_seq = 0;
799*789Sahrens 
800*789Sahrens 	if (zilog->zl_suspend) {
801*789Sahrens 		lwb = NULL;
802*789Sahrens 	} else {
803*789Sahrens 		lwb = list_tail(&zilog->zl_lwb_list);
804*789Sahrens 		if (lwb == NULL) {
805*789Sahrens 			mutex_exit(&zilog->zl_lock);
806*789Sahrens 			zil_create(zilog);
807*789Sahrens 			mutex_enter(&zilog->zl_lock);
808*789Sahrens 			lwb = list_tail(&zilog->zl_lwb_list);
809*789Sahrens 		}
810*789Sahrens 	}
811*789Sahrens 
812*789Sahrens 	/*
813*789Sahrens 	 * Loop through in-memory log transactions filling log blocks,
814*789Sahrens 	 * until we reach the given sequence number and there's no more
815*789Sahrens 	 * room in the write buffer.
816*789Sahrens 	 */
817*789Sahrens 	for (;;) {
818*789Sahrens 		itx = list_head(&zilog->zl_itx_list);
819*789Sahrens 		if (itx == NULL)
820*789Sahrens 			break;
821*789Sahrens 
822*789Sahrens 		reclen = itx->itx_lr.lrc_reclen;
823*789Sahrens 		if ((itx->itx_lr.lrc_seq > seq) &&
824*789Sahrens 		    ((lwb == NULL) || (lwb->lwb_nused + reclen >
825*789Sahrens 		    ZIL_BLK_DATA_SZ(lwb))))
826*789Sahrens 			break;
827*789Sahrens 
828*789Sahrens 		list_remove(&zilog->zl_itx_list, itx);
829*789Sahrens 		txg = itx->itx_lr.lrc_txg;
830*789Sahrens 		ASSERT(txg);
831*789Sahrens 
832*789Sahrens 		mutex_exit(&zilog->zl_lock);
833*789Sahrens 		if (txg > spa_last_synced_txg(spa) ||
834*789Sahrens 		    txg > spa_freeze_txg(spa))
835*789Sahrens 			lwb = zil_lwb_commit(zilog, itx, lwb);
836*789Sahrens 		else
837*789Sahrens 			max_seq = itx->itx_lr.lrc_seq;
838*789Sahrens 		kmem_free(itx, offsetof(itx_t, itx_lr)
839*789Sahrens 		    + itx->itx_lr.lrc_reclen);
840*789Sahrens 		mutex_enter(&zilog->zl_lock);
841*789Sahrens 		zilog->zl_itx_list_sz -= reclen;
842*789Sahrens 	}
843*789Sahrens 
844*789Sahrens 	mutex_exit(&zilog->zl_lock);
845*789Sahrens 
846*789Sahrens 	/* write the last block out */
847*789Sahrens 	if (lwb != NULL && lwb->lwb_nused != 0)
848*789Sahrens 		lwb = zil_lwb_write_start(zilog, lwb);
849*789Sahrens 
850*789Sahrens 	/* wake up others waiting to start a write */
851*789Sahrens 	mutex_enter(&zilog->zl_lock);
852*789Sahrens 	zilog->zl_writer = B_FALSE;
853*789Sahrens 	cv_signal(&zilog->zl_cv_write);
854*789Sahrens 
855*789Sahrens 	if (max_seq > zilog->zl_ss_seq) {
856*789Sahrens 		zilog->zl_ss_seq = max_seq;
857*789Sahrens 		cv_broadcast(&zilog->zl_cv_seq);
858*789Sahrens 	}
859*789Sahrens 	/*
860*789Sahrens 	 * Wait if necessary for our seq to be committed.
861*789Sahrens 	 */
862*789Sahrens 	if (lwb) {
863*789Sahrens 		while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
864*789Sahrens 			cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
865*789Sahrens 		zil_flush_vdevs(zilog, seq);
866*789Sahrens 	}
867*789Sahrens 	if (zilog->zl_log_error || lwb == NULL) {
868*789Sahrens 		zilog->zl_log_error = 0;
869*789Sahrens 		max_seq = zilog->zl_itx_seq;
870*789Sahrens 		mutex_exit(&zilog->zl_lock);
871*789Sahrens 		txg_wait_synced(zilog->zl_dmu_pool, 0);
872*789Sahrens 		mutex_enter(&zilog->zl_lock);
873*789Sahrens 		zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
874*789Sahrens 		cv_broadcast(&zilog->zl_cv_seq);
875*789Sahrens 	}
876*789Sahrens 	mutex_exit(&zilog->zl_lock);
877*789Sahrens }
878*789Sahrens 
879*789Sahrens /*
880*789Sahrens  * Called in syncing context to free committed log blocks and update log header.
881*789Sahrens  */
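/*
 * An lwb may be freed only after its write has completed (lwb_buf == NULL)
 * and every txg it covers (lwb_max_txg) has synced; zh_log then advances
 * to the oldest block still in the chain.
 */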
882*789Sahrens void
883*789Sahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx)
884*789Sahrens {
885*789Sahrens 	uint64_t txg = dmu_tx_get_txg(tx);
886*789Sahrens 	spa_t *spa = zilog->zl_spa;
887*789Sahrens 	lwb_t *lwb;
888*789Sahrens 
889*789Sahrens 	ASSERT(zilog->zl_stop_sync == 0);
890*789Sahrens 
891*789Sahrens 	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
892*789Sahrens 
893*789Sahrens 	if (zilog->zl_destroy_txg == txg) {
894*789Sahrens 		bzero(zilog->zl_header, sizeof (zil_header_t));
895*789Sahrens 		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
896*789Sahrens 		zilog->zl_destroy_txg = 0;
897*789Sahrens 	}
898*789Sahrens 
899*789Sahrens 	mutex_enter(&zilog->zl_lock);
900*789Sahrens 	for (;;) {
901*789Sahrens 		lwb = list_head(&zilog->zl_lwb_list);
902*789Sahrens 		if (lwb == NULL) {
903*789Sahrens 			mutex_exit(&zilog->zl_lock);
904*789Sahrens 			return;
905*789Sahrens 		}
906*789Sahrens 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
907*789Sahrens 			break;
908*789Sahrens 		list_remove(&zilog->zl_lwb_list, lwb);
909*789Sahrens 		zio_free_blk(spa, &lwb->lwb_blk, txg);
910*789Sahrens 		kmem_cache_free(zil_lwb_cache, lwb);
911*789Sahrens 	}
912*789Sahrens 	zilog->zl_header->zh_log = lwb->lwb_blk;
913*789Sahrens 	mutex_exit(&zilog->zl_lock);
914*789Sahrens }
915*789Sahrens 
916*789Sahrens void
917*789Sahrens zil_init(void)
918*789Sahrens {
919*789Sahrens 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
920*789Sahrens 	    sizeof (struct lwb), NULL, NULL, NULL, NULL, NULL, NULL, 0);
921*789Sahrens }
922*789Sahrens 
923*789Sahrens void
924*789Sahrens zil_fini(void)
925*789Sahrens {
926*789Sahrens 	kmem_cache_destroy(zil_lwb_cache);
927*789Sahrens }
928*789Sahrens 
929*789Sahrens zilog_t *
930*789Sahrens zil_alloc(objset_t *os, zil_header_t *zh_phys)
931*789Sahrens {
932*789Sahrens 	zilog_t *zilog;
933*789Sahrens 
934*789Sahrens 	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
935*789Sahrens 
936*789Sahrens 	zilog->zl_header = zh_phys;
937*789Sahrens 	zilog->zl_os = os;
938*789Sahrens 	zilog->zl_spa = dmu_objset_spa(os);
939*789Sahrens 	zilog->zl_dmu_pool = dmu_objset_pool(os);
940*789Sahrens 	zilog->zl_prev_blk_sz = ZIL_MIN_BLKSZ;
941*789Sahrens 
942*789Sahrens 	list_create(&zilog->zl_itx_list, sizeof (itx_t),
943*789Sahrens 	    offsetof(itx_t, itx_node));
944*789Sahrens 
945*789Sahrens 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
946*789Sahrens 	    offsetof(lwb_t, lwb_node));
947*789Sahrens 
948*789Sahrens 	list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
949*789Sahrens 	    offsetof(zil_vdev_t, vdev_seq_node));
950*789Sahrens 
951*789Sahrens 	return (zilog);
952*789Sahrens }
953*789Sahrens 
954*789Sahrens void
955*789Sahrens zil_free(zilog_t *zilog)
956*789Sahrens {
957*789Sahrens 	lwb_t *lwb;
958*789Sahrens 	zil_vdev_t *zv;
959*789Sahrens 
960*789Sahrens 	zilog->zl_stop_sync = 1;
961*789Sahrens 
962*789Sahrens 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
963*789Sahrens 		list_remove(&zilog->zl_lwb_list, lwb);
964*789Sahrens 		if (lwb->lwb_buf != NULL)
965*789Sahrens 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
966*789Sahrens 		kmem_cache_free(zil_lwb_cache, lwb);
967*789Sahrens 	}
968*789Sahrens 	list_destroy(&zilog->zl_lwb_list);
969*789Sahrens 
970*789Sahrens 	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
971*789Sahrens 		list_remove(&zilog->zl_vdev_list, zv);
972*789Sahrens 		kmem_free(zv, sizeof (zil_vdev_t));
973*789Sahrens 	}
974*789Sahrens 	list_destroy(&zilog->zl_vdev_list);
975*789Sahrens 
976*789Sahrens 	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
977*789Sahrens 	list_destroy(&zilog->zl_itx_list);
978*789Sahrens 
979*789Sahrens 	kmem_free(zilog, sizeof (zilog_t));
980*789Sahrens }
981*789Sahrens 
982*789Sahrens /*
983*789Sahrens  * Open an intent log.
984*789Sahrens  */
985*789Sahrens zilog_t *
986*789Sahrens zil_open(objset_t *os, zil_get_data_t *get_data)
987*789Sahrens {
988*789Sahrens 	zilog_t *zilog = dmu_objset_zil(os);
989*789Sahrens 
990*789Sahrens 	zilog->zl_get_data = get_data;
991*789Sahrens 	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
992*789Sahrens 	    2, 2, TASKQ_PREPOPULATE);
993*789Sahrens 
994*789Sahrens 	return (zilog);
995*789Sahrens }
996*789Sahrens 
997*789Sahrens /*
998*789Sahrens  * Close an intent log.
999*789Sahrens  */
1000*789Sahrens void
1001*789Sahrens zil_close(zilog_t *zilog)
1002*789Sahrens {
1003*789Sahrens 	txg_wait_synced(zilog->zl_dmu_pool, 0);
1004*789Sahrens 	taskq_destroy(zilog->zl_clean_taskq);
1005*789Sahrens 	zilog->zl_clean_taskq = NULL;
1006*789Sahrens 	zilog->zl_get_data = NULL;
1007*789Sahrens 
1008*789Sahrens 	zil_itx_clean(zilog);
1009*789Sahrens 	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
1010*789Sahrens }
1011*789Sahrens 
1012*789Sahrens /*
1013*789Sahrens  * Suspend an intent log.  While in suspended mode, we still honor
1014*789Sahrens  * synchronous semantics, but we rely on txg_wait_synced() to do it.
1015*789Sahrens  * We suspend the log briefly when taking a snapshot so that the snapshot
1016*789Sahrens  * contains all the data it's supposed to, and has an empty intent log.
1017*789Sahrens  */
1018*789Sahrens int
1019*789Sahrens zil_suspend(zilog_t *zilog)
1020*789Sahrens {
1021*789Sahrens 	lwb_t *lwb;
1022*789Sahrens 
1023*789Sahrens 	mutex_enter(&zilog->zl_lock);
1024*789Sahrens 	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
1025*789Sahrens 		mutex_exit(&zilog->zl_lock);
1026*789Sahrens 		return (EBUSY);
1027*789Sahrens 	}
1028*789Sahrens 	zilog->zl_suspend++;
1029*789Sahrens 	mutex_exit(&zilog->zl_lock);
1030*789Sahrens 
1031*789Sahrens 	zil_commit(zilog, UINT64_MAX, FSYNC);
1032*789Sahrens 
1033*789Sahrens 	mutex_enter(&zilog->zl_lock);
1034*789Sahrens 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
1035*789Sahrens 		if (lwb->lwb_buf != NULL) {
1036*789Sahrens 			/*
1037*789Sahrens 			 * Wait for the buffer if it's in the process of
1038*789Sahrens 			 * being written.
1039*789Sahrens 			 */
1040*789Sahrens 			if ((lwb->lwb_seq != 0) &&
1041*789Sahrens 			    (lwb->lwb_state != SEQ_COMPLETE)) {
1042*789Sahrens 				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
1043*789Sahrens 				continue;
1044*789Sahrens 			}
1045*789Sahrens 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
1046*789Sahrens 		}
1047*789Sahrens 		list_remove(&zilog->zl_lwb_list, lwb);
1048*789Sahrens 		kmem_cache_free(zil_lwb_cache, lwb);
1049*789Sahrens 	}
1050*789Sahrens 	mutex_exit(&zilog->zl_lock);
1051*789Sahrens 
1052*789Sahrens 	zil_destroy(zilog);
1053*789Sahrens 
1054*789Sahrens 	return (0);
1055*789Sahrens }
1056*789Sahrens 
1057*789Sahrens void
1058*789Sahrens zil_resume(zilog_t *zilog)
1059*789Sahrens {
1060*789Sahrens 	mutex_enter(&zilog->zl_lock);
1061*789Sahrens 	ASSERT(zilog->zl_suspend != 0);
1062*789Sahrens 	zilog->zl_suspend--;
1063*789Sahrens 	mutex_exit(&zilog->zl_lock);
1064*789Sahrens }
1065*789Sahrens 
1066*789Sahrens typedef struct zil_replay_arg {
1067*789Sahrens 	objset_t	*zr_os;
1068*789Sahrens 	zil_replay_func_t **zr_replay;
1069*789Sahrens 	void		*zr_arg;
1070*789Sahrens 	void		(*zr_rm_sync)(void *arg);
1071*789Sahrens 	uint64_t	*zr_txgp;
1072*789Sahrens 	boolean_t	zr_byteswap;
1073*789Sahrens 	char		*zr_lrbuf;
1074*789Sahrens } zil_replay_arg_t;
1075*789Sahrens 
1076*789Sahrens static void
1077*789Sahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
1078*789Sahrens {
1079*789Sahrens 	zil_replay_arg_t *zr = zra;
1080*789Sahrens 	zil_header_t *zh = zilog->zl_header;
1081*789Sahrens 	uint64_t reclen = lr->lrc_reclen;
1082*789Sahrens 	uint64_t txtype = lr->lrc_txtype;
1083*789Sahrens 	int pass, error;
1084*789Sahrens 
1085*789Sahrens 	if (zilog->zl_stop_replay)
1086*789Sahrens 		return;
1087*789Sahrens 
1088*789Sahrens 	if (lr->lrc_txg < claim_txg)		/* already committed */
1089*789Sahrens 		return;
1090*789Sahrens 
1091*789Sahrens 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
1092*789Sahrens 		return;
1093*789Sahrens 
1094*789Sahrens 	/*
1095*789Sahrens 	 * Make a copy of the data so we can revise and extend it.
1096*789Sahrens 	 */
1097*789Sahrens 	bcopy(lr, zr->zr_lrbuf, reclen);
1098*789Sahrens 
1099*789Sahrens 	/*
1100*789Sahrens 	 * The log block containing this lr may have been byteswapped
1101*789Sahrens 	 * so that we can easily examine common fields like lrc_txtype.
1102*789Sahrens 	 * However, the log is a mix of different data types, and only the
1103*789Sahrens 	 * replay vectors know how to byteswap their records.  Therefore, if
1104*789Sahrens 	 * the lr was byteswapped, undo it before invoking the replay vector.
1105*789Sahrens 	 */
1106*789Sahrens 	if (zr->zr_byteswap)
1107*789Sahrens 		byteswap_uint64_array(zr->zr_lrbuf, reclen);
1108*789Sahrens 
1109*789Sahrens 	/*
1110*789Sahrens 	 * If this is a TX_WRITE with a blkptr, suck in the data.
1111*789Sahrens 	 */
1112*789Sahrens 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
1113*789Sahrens 		lr_write_t *lrw = (lr_write_t *)lr;
1114*789Sahrens 		blkptr_t *wbp = &lrw->lr_blkptr;
1115*789Sahrens 		uint64_t wlen = lrw->lr_length;
1116*789Sahrens 		char *wbuf = zr->zr_lrbuf + reclen;
1117*789Sahrens 
1118*789Sahrens 		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
1119*789Sahrens 			bzero(wbuf, wlen);
1120*789Sahrens 		} else {
1121*789Sahrens 			/*
1122*789Sahrens 			 * A subsequent write may have overwritten this block,
1123*789Sahrens 			 * in which case wbp may have been freed and
1124*789Sahrens 			 * reallocated, and our read of wbp may fail with a
1125*789Sahrens 			 * checksum error.  We can safely ignore this because
1126*789Sahrens 			 * the later write will provide the correct data.
1127*789Sahrens 			 */
1128*789Sahrens 			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
1129*789Sahrens 			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
1130*789Sahrens 			    ZIO_PRIORITY_SYNC_READ,
1131*789Sahrens 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
1132*789Sahrens 			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
1133*789Sahrens 		}
1134*789Sahrens 	}
1135*789Sahrens 
1136*789Sahrens 	/*
1137*789Sahrens 	 * We must now do two things atomically: replay this log record,
1138*789Sahrens 	 * and update the log header to reflect the fact that we did so.
1139*789Sahrens 	 * We use the DMU's ability to assign into a specific txg to do this.
1140*789Sahrens 	 */
1141*789Sahrens 	for (pass = 1; /* CONSTANTCONDITION */; pass++) {
1142*789Sahrens 		uint64_t replay_txg;
1143*789Sahrens 		dmu_tx_t *replay_tx;
1144*789Sahrens 
1145*789Sahrens 		replay_tx = dmu_tx_create(zr->zr_os);
1146*789Sahrens 		error = dmu_tx_assign(replay_tx, TXG_WAIT);
1147*789Sahrens 		if (error) {
1148*789Sahrens 			dmu_tx_abort(replay_tx);
1149*789Sahrens 			break;
1150*789Sahrens 		}
1151*789Sahrens 
1152*789Sahrens 		replay_txg = dmu_tx_get_txg(replay_tx);
1153*789Sahrens 
1154*789Sahrens 		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
1155*789Sahrens 			error = EINVAL;
1156*789Sahrens 		} else {
1157*789Sahrens 			/*
1158*789Sahrens 			 * On the first pass, arrange for the replay vector
1159*789Sahrens 			 * to fail its dmu_tx_assign().  That's the only way
1160*789Sahrens 			 * to ensure that those code paths remain well tested.
1161*789Sahrens 			 */
1162*789Sahrens 			*zr->zr_txgp = replay_txg - (pass == 1);
1163*789Sahrens 			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
1164*789Sahrens 			    zr->zr_byteswap);
1165*789Sahrens 			*zr->zr_txgp = TXG_NOWAIT;
1166*789Sahrens 		}
1167*789Sahrens 
1168*789Sahrens 		if (error == 0) {
1169*789Sahrens 			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
1170*789Sahrens 			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
1171*789Sahrens 			    lr->lrc_seq;
1172*789Sahrens 		}
1173*789Sahrens 
1174*789Sahrens 		dmu_tx_commit(replay_tx);
1175*789Sahrens 
1176*789Sahrens 		if (error != ERESTART)
1177*789Sahrens 			break;
1178*789Sahrens 
1179*789Sahrens 		if (pass != 1)
1180*789Sahrens 			txg_wait_open(spa_get_dsl(zilog->zl_spa),
1181*789Sahrens 			    replay_txg + 1);
1182*789Sahrens 
1183*789Sahrens 		dprintf("pass %d, retrying\n", pass);
1184*789Sahrens 	}
1185*789Sahrens 
1186*789Sahrens 	if (error) {
1187*789Sahrens 		char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1188*789Sahrens 		dmu_objset_name(zr->zr_os, name);
1189*789Sahrens 		cmn_err(CE_WARN, "ZFS replay transaction error %d, "
1190*789Sahrens 		    "dataset %s, seq 0x%llx, txtype %llu\n",
1191*789Sahrens 		    error, name,
1192*789Sahrens 		    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
1193*789Sahrens 		zilog->zl_stop_replay = 1;
1194*789Sahrens 		kmem_free(name, MAXNAMELEN);
1195*789Sahrens 	}
1196*789Sahrens 
1197*789Sahrens 	/*
1198*789Sahrens 	 * The DMU's dnode layer doesn't see removes until the txg commits,
1199*789Sahrens 	 * so a subsequent claim can spuriously fail with EEXIST.
1200*789Sahrens 	 * To prevent this, if we might have removed an object,
1201*789Sahrens 	 * wait for the delete thread to delete it, and then
1202*789Sahrens 	 * wait for the transaction group to sync.
1203*789Sahrens 	 */
1204*789Sahrens 	if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
1205*789Sahrens 		if (zr->zr_rm_sync != NULL)
1206*789Sahrens 			zr->zr_rm_sync(zr->zr_arg);
1207*789Sahrens 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
1208*789Sahrens 	}
1209*789Sahrens }
1210*789Sahrens 
1211*789Sahrens /*
1212*789Sahrens  * If this dataset has an intent log, replay it and destroy it.
1213*789Sahrens  */
1214*789Sahrens void
1215*789Sahrens zil_replay(objset_t *os, void *arg, uint64_t *txgp,
1216*789Sahrens 	zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
1217*789Sahrens {
1218*789Sahrens 	zilog_t *zilog = dmu_objset_zil(os);
1219*789Sahrens 	zil_replay_arg_t zr;
1220*789Sahrens 
1221*789Sahrens 	zr.zr_os = os;
1222*789Sahrens 	zr.zr_replay = replay_func;
1223*789Sahrens 	zr.zr_arg = arg;
1224*789Sahrens 	zr.zr_rm_sync = rm_sync;
1225*789Sahrens 	zr.zr_txgp = txgp;
1226*789Sahrens 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
1227*789Sahrens 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
1228*789Sahrens 
1229*789Sahrens 	/*
1230*789Sahrens 	 * Wait for in-progress removes to sync before starting replay.
1231*789Sahrens 	 */
1232*789Sahrens 	if (rm_sync != NULL)
1233*789Sahrens 		rm_sync(arg);
1234*789Sahrens 	txg_wait_synced(zilog->zl_dmu_pool, 0);
1235*789Sahrens 
1236*789Sahrens 	zilog->zl_stop_replay = 0;
1237*789Sahrens 	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
1238*789Sahrens 	    zilog->zl_header->zh_claim_txg);
1239*789Sahrens 	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
1240*789Sahrens 
1241*789Sahrens 	zil_destroy(zilog);
1242*789Sahrens }
1243