/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>

/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them. These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to a fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system. Its on-disk (pool) format consists
 * of 3 parts:
 *
 *        - ZIL header
 *        - ZIL blocks
 *        - ZIL records
 *
 * A log record holds a system call transaction. Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available. The sketch below shows the
 * ZIL structure:
 */
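
/*
 * A rough sketch of the chain described above (illustrative only; block
 * placement in the pool is arbitrary):
 *
 *        ZIL header
 *            zh_log
 *              |
 *              v
 *        +------------+        +------------+        +------------+
 *        | ZIL block  |   bp   | ZIL block  |   bp   | ZIL block  |
 *        | records... | -----> | records... | -----> | records... |
 *        +------------+        +------------+        +------------+
 */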

/*
 * This global ZIL switch affects all pools
 */
int zil_replay_disable = 0;    /* disable intent logging replay */

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;

static kmem_cache_t *zil_lwb_cache;

static boolean_t zil_empty(zilog_t *zilog);

#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))


/*
 * ziltest is by and large an ugly hack, but very useful in
 * checking replay without tedious work.
 * When running ziltest we want to keep all itx's and so maintain
 * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
 * We subtract TXG_CONCURRENT_STATES to allow for common code.
 */
#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)

static int
zil_bp_compare(const void *x1, const void *x2)
{
        const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
        const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;

        if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
                return (-1);
        if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
                return (1);

        if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
                return (-1);
        if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
                return (1);

        return (0);
}

static void
zil_bp_tree_init(zilog_t *zilog)
{
        avl_create(&zilog->zl_bp_tree, zil_bp_compare,
            sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}

static void
zil_bp_tree_fini(zilog_t *zilog)
{
        avl_tree_t *t = &zilog->zl_bp_tree;
        zil_bp_node_t *zn;
        void *cookie = NULL;

        while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
                kmem_free(zn, sizeof (zil_bp_node_t));

        avl_destroy(t);
}

int
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
        avl_tree_t *t = &zilog->zl_bp_tree;
        const dva_t *dva = BP_IDENTITY(bp);
        zil_bp_node_t *zn;
        avl_index_t where;

        if (avl_find(t, dva, &where) != NULL)
                return (EEXIST);

        zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
        zn->zn_dva = *dva;
        avl_insert(t, zn, where);

        return (0);
}
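
/*
 * Illustrative sketch (not part of the original code): zil_bp_tree_add()
 * returns EEXIST for a block we have already seen, so callers such as
 * zil_claim_log_block() below treat any nonzero return as "skip this
 * block":
 *
 *        if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
 *                return (0);
 */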

static zil_header_t *
zil_header_in_syncing_context(zilog_t *zilog)
{
        return ((zil_header_t *)zilog->zl_header);
}

static void
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
{
        zio_cksum_t *zc = &bp->blk_cksum;

        zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
        zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
        zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
        zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
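
/*
 * Sketch of the self-describing chain seeded above (illustrative): the
 * block checksum doubles as the chain link. If block N carries
 * zc_word[ZIL_ZC_SEQ] == 7, then the zil_chain_t it embeds must record
 * a checksum for block N+1 whose ZIL_ZC_SEQ is 8; zil_read_log_block()
 * below rejects anything else as the end of the chain:
 *
 *        zio_cksum_t expect = bp->blk_cksum;
 *        expect.zc_word[ZIL_ZC_SEQ]++;
 *        (compare expect against zc_next_blk.blk_cksum)
 */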

/*
 * Read a log block and make sure it's valid.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
    char **end)
{
        enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
        uint32_t aflags = ARC_WAIT;
        arc_buf_t *abuf = NULL;
        zbookmark_t zb;
        int error;

        if (zilog->zl_header->zh_claim_txg == 0)
                zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

        if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
                zio_flags |= ZIO_FLAG_SPECULATIVE;

        SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
            ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

        error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
            ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

        if (error == 0) {
                zio_cksum_t cksum = bp->blk_cksum;

                /*
                 * Validate the checksummed log block.
                 *
                 * Sequence numbers should be... sequential. The checksum
                 * verifier for the next block should be bp's checksum plus 1.
                 *
                 * Also check the log chain linkage and size used.
                 */
                cksum.zc_word[ZIL_ZC_SEQ]++;

                if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
                        zil_chain_t *zilc = abuf->b_data;
                        char *lr = (char *)(zilc + 1);
                        uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

                        if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
                            sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
                                error = ECKSUM;
                        } else {
                                bcopy(lr, dst, len);
                                *end = (char *)dst + len;
                                *nbp = zilc->zc_next_blk;
                        }
                } else {
                        char *lr = abuf->b_data;
                        uint64_t size = BP_GET_LSIZE(bp);
                        zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

                        if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
                            sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
                            (zilc->zc_nused > (size - sizeof (*zilc)))) {
                                error = ECKSUM;
                        } else {
                                bcopy(lr, dst, zilc->zc_nused);
                                *end = (char *)dst + zilc->zc_nused;
                                *nbp = zilc->zc_next_blk;
                        }
                }

                VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
        }

        return (error);
}
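
/*
 * The two block formats handled above, sketched (illustrative):
 *
 * ZIO_CHECKSUM_ZILOG2 blocks lead with the chain header:
 *
 *        +-------------+----------------------------------------+
 *        | zil_chain_t | log records ... (zc_nused bytes total)  |
 *        +-------------+----------------------------------------+
 *
 * Older ZIO_CHECKSUM_ZILOG blocks trail with it instead:
 *
 *        +----------------------------------------+-------------+
 *        | log records ...                        | zil_chain_t |
 *        +----------------------------------------+-------------+
 */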

/*
 * Read a TX_WRITE log data block.
 */
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
        enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
        const blkptr_t *bp = &lr->lr_blkptr;
        uint32_t aflags = ARC_WAIT;
        arc_buf_t *abuf = NULL;
        zbookmark_t zb;
        int error;

        if (BP_IS_HOLE(bp)) {
                if (wbuf != NULL)
                        bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
                return (0);
        }

        if (zilog->zl_header->zh_claim_txg == 0)
                zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

        SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
            ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

        error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
            ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

        if (error == 0) {
                if (wbuf != NULL)
                        bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
                (void) arc_buf_remove_ref(abuf, &abuf);
        }

        return (error);
}

/*
 * Parse the intent log, and call parse_func for each valid record within.
 */
int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
        const zil_header_t *zh = zilog->zl_header;
        boolean_t claimed = !!zh->zh_claim_txg;
        uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
        uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
        uint64_t max_blk_seq = 0;
        uint64_t max_lr_seq = 0;
        uint64_t blk_count = 0;
        uint64_t lr_count = 0;
        blkptr_t blk, next_blk;
        char *lrbuf, *lrp;
        int error = 0;

        /*
         * Old logs didn't record the maximum zh_claim_lr_seq.
         */
        if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
                claim_lr_seq = UINT64_MAX;

        /*
         * Starting at the block pointed to by zh_log we read the log chain.
         * For each block in the chain we strongly check that block to
         * ensure its validity. We stop when an invalid block is found.
         * For each block pointer in the chain we call parse_blk_func().
         * For each record in each valid block we call parse_lr_func().
         * If the log has been claimed, stop if we encounter a sequence
         * number greater than the highest claimed sequence number.
         */
        lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
        zil_bp_tree_init(zilog);

        for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
                uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
                int reclen;
                char *end;

                if (blk_seq > claim_blk_seq)
                        break;
                if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
                        break;
                ASSERT3U(max_blk_seq, <, blk_seq);
                max_blk_seq = blk_seq;
                blk_count++;

                if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
                        break;

                error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
                if (error)
                        break;

                for (lrp = lrbuf; lrp < end; lrp += reclen) {
                        lr_t *lr = (lr_t *)lrp;
                        reclen = lr->lrc_reclen;
                        ASSERT3U(reclen, >=, sizeof (lr_t));
                        if (lr->lrc_seq > claim_lr_seq)
                                goto done;
                        if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
                                goto done;
                        ASSERT3U(max_lr_seq, <, lr->lrc_seq);
                        max_lr_seq = lr->lrc_seq;
                        lr_count++;
                }
        }
done:
        zilog->zl_parse_error = error;
        zilog->zl_parse_blk_seq = max_blk_seq;
        zilog->zl_parse_lr_seq = max_lr_seq;
        zilog->zl_parse_blk_count = blk_count;
        zilog->zl_parse_lr_count = lr_count;

        ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
            (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

        zil_bp_tree_fini(zilog);
        zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);

        return (error);
}
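
/*
 * Usage sketch (mirrors zil_claim() later in this file): either callback
 * may stop the walk by returning nonzero; the parse totals are left in
 * the zl_parse_* fields for the caller to inspect afterwards:
 *
 *        (void) zil_parse(zilog, zil_claim_log_block,
 *            zil_claim_log_record, tx, first_txg);
 *        zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 */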

static int
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
        /*
         * Claim log block if not already committed and not already claimed.
         * If tx == NULL, just verify that the block is claimable.
         */
        if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
                return (0);

        return (zio_wait(zio_claim(NULL, zilog->zl_spa,
            tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
}

static int
zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
{
        lr_write_t *lr = (lr_write_t *)lrc;
        int error;

        if (lrc->lrc_txtype != TX_WRITE)
                return (0);

        /*
         * If the block is not readable, don't claim it. This can happen
         * in normal operation when a log block is written to disk before
         * some of the dmu_sync() blocks it points to. In this case, the
         * transaction cannot have been committed to anyone (we would have
         * waited for all writes to be stable first), so it is semantically
         * correct to declare this the end of the log.
         */
        if (lr->lr_blkptr.blk_birth >= first_txg &&
            (error = zil_read_log_data(zilog, lr, NULL)) != 0)
                return (error);
        return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
        zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

        return (0);
}

static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
        lr_write_t *lr = (lr_write_t *)lrc;
        blkptr_t *bp = &lr->lr_blkptr;

        /*
         * If we previously claimed it, we need to free it.
         */
        if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
            bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
                zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

        return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
{
        lwb_t *lwb;

        lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
        lwb->lwb_zilog = zilog;
        lwb->lwb_blk = *bp;
        lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
        lwb->lwb_max_txg = txg;
        lwb->lwb_zio = NULL;
        lwb->lwb_tx = NULL;
        if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
                lwb->lwb_nused = sizeof (zil_chain_t);
                lwb->lwb_sz = BP_GET_LSIZE(bp);
        } else {
                lwb->lwb_nused = 0;
                lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
        }

        mutex_enter(&zilog->zl_lock);
        list_insert_tail(&zilog->zl_lwb_list, lwb);
        mutex_exit(&zilog->zl_lock);

        return (lwb);
}
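
/*
 * Sketch of the lwb accounting established above (illustrative), given
 * the two block layouts: lwb_nused already counts the leading
 * zil_chain_t for ZILOG2 blocks, while older blocks reserve the trailer
 * by shrinking lwb_sz instead:
 *
 *        format        lwb_nused (initial)     lwb_sz
 *        ZILOG2        sizeof (zil_chain_t)    BP_GET_LSIZE(bp)
 *        ZILOG         0                       BP_GET_LSIZE(bp) -
 *                                                  sizeof (zil_chain_t)
 *
 * Either way, lwb_sz - lwb_nused is the space left for log records.
 */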

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
        const zil_header_t *zh = zilog->zl_header;
        lwb_t *lwb = NULL;
        uint64_t txg = 0;
        dmu_tx_t *tx = NULL;
        blkptr_t blk;
        int error = 0;

        /*
         * Wait for any previous destroy to complete.
         */
        txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

        ASSERT(zh->zh_claim_txg == 0);
        ASSERT(zh->zh_replay_seq == 0);

        blk = zh->zh_log;

        /*
         * Allocate an initial log block if:
         *    - there isn't one already
         *    - the existing block is the wrong endianness
         */
        if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
                tx = dmu_tx_create(zilog->zl_os);
                VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
                dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
                txg = dmu_tx_get_txg(tx);

                if (!BP_IS_HOLE(&blk)) {
                        zio_free_zil(zilog->zl_spa, txg, &blk);
                        BP_ZERO(&blk);
                }

                error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
                    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

                if (error == 0)
                        zil_init_log_chain(zilog, &blk);
        }

        /*
         * Allocate a log write buffer (lwb) for the first log block.
         */
        if (error == 0)
                lwb = zil_alloc_lwb(zilog, &blk, txg);

        /*
         * If we just allocated the first log block, commit our transaction
         * and wait for zil_sync() to stuff the block pointer into zh_log.
         * (zh is part of the MOS, so we cannot modify it in open context.)
         */
        if (tx != NULL) {
                dmu_tx_commit(tx);
                txg_wait_synced(zilog->zl_dmu_pool, txg);
        }

        ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

        return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create(). We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
        const zil_header_t *zh = zilog->zl_header;
        lwb_t *lwb;
        dmu_tx_t *tx;
        uint64_t txg;

        /*
         * Wait for any previous destroy to complete.
         */
        txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

        zilog->zl_old_header = *zh;        /* debugging aid */

        if (BP_IS_HOLE(&zh->zh_log))
                return;

        tx = dmu_tx_create(zilog->zl_os);
        VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
        dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
        txg = dmu_tx_get_txg(tx);

        mutex_enter(&zilog->zl_lock);

        ASSERT3U(zilog->zl_destroy_txg, <, txg);
        zilog->zl_destroy_txg = txg;
        zilog->zl_keep_first = keep_first;

        if (!list_is_empty(&zilog->zl_lwb_list)) {
                ASSERT(zh->zh_claim_txg == 0);
                ASSERT(!keep_first);
                while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
                        list_remove(&zilog->zl_lwb_list, lwb);
                        if (lwb->lwb_buf != NULL)
                                zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
                        zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
                        kmem_cache_free(zil_lwb_cache, lwb);
                }
        } else if (!keep_first) {
                (void) zil_parse(zilog, zil_free_log_block,
                    zil_free_log_record, tx, zh->zh_claim_txg);
        }
        mutex_exit(&zilog->zl_lock);

        dmu_tx_commit(tx);
}

int
zil_claim(const char *osname, void *txarg)
{
        dmu_tx_t *tx = txarg;
        uint64_t first_txg = dmu_tx_get_txg(tx);
        zilog_t *zilog;
        zil_header_t *zh;
        objset_t *os;
        int error;

        error = dmu_objset_hold(osname, FTAG, &os);
        if (error) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
        }

        zilog = dmu_objset_zil(os);
        zh = zil_header_in_syncing_context(zilog);

        if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
                if (!BP_IS_HOLE(&zh->zh_log))
                        zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
                BP_ZERO(&zh->zh_log);
                dsl_dataset_dirty(dmu_objset_ds(os), tx);
                dmu_objset_rele(os, FTAG);
                return (0);
        }

        /*
         * Claim all log blocks if we haven't already done so, and remember
         * the highest claimed sequence number. This ensures that if we can
         * read only part of the log now (e.g. due to a missing device),
         * but we can read the entire log later, we will not try to replay
         * or destroy beyond the last block we successfully claimed.
         */
        ASSERT3U(zh->zh_claim_txg, <=, first_txg);
        if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
                (void) zil_parse(zilog, zil_claim_log_block,
                    zil_claim_log_record, tx, first_txg);
                zh->zh_claim_txg = first_txg;
                zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
                zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
                if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
                        zh->zh_flags |= ZIL_REPLAY_NEEDED;
                zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
                dsl_dataset_dirty(dmu_objset_ds(os), tx);
        }

        ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
        dmu_objset_rele(os, FTAG);
        return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 */
int
zil_check_log_chain(const char *osname, void *tx)
{
        zilog_t *zilog;
        objset_t *os;
        int error;

        ASSERT(tx == NULL);

        error = dmu_objset_hold(osname, FTAG, &os);
        if (error) {
                cmn_err(CE_WARN, "can't open objset for %s", osname);
                return (0);
        }

        zilog = dmu_objset_zil(os);

        /*
         * Because tx == NULL, zil_claim_log_block() will not actually claim
         * any blocks, but just determine whether it is possible to do so.
         * In addition to checking the log chain, zil_claim_log_block()
         * will invoke zio_claim() with a done func of spa_claim_notify(),
         * which will update spa_max_claim_txg. See spa_load() for details.
         */
        error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
            zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

        dmu_objset_rele(os, FTAG);

        return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

static int
zil_vdev_compare(const void *x1, const void *x2)
{
        const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
        const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

        if (v1 < v2)
                return (-1);
        if (v1 > v2)
                return (1);

        return (0);
}

void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
        avl_tree_t *t = &zilog->zl_vdev_tree;
        avl_index_t where;
        zil_vdev_node_t *zv, zvsearch;
        int ndvas = BP_GET_NDVAS(bp);
        int i;

        if (zfs_nocacheflush)
                return;

        ASSERT(zilog->zl_writer);

        /*
         * Even though we're zl_writer, we still need a lock because the
         * zl_get_data() callbacks may have dmu_sync() done callbacks
         * that will run concurrently.
         */
        mutex_enter(&zilog->zl_vdev_lock);
        for (i = 0; i < ndvas; i++) {
                zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
                if (avl_find(t, &zvsearch, &where) == NULL) {
                        zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
                        zv->zv_vdev = zvsearch.zv_vdev;
                        avl_insert(t, zv, where);
                }
        }
        mutex_exit(&zilog->zl_vdev_lock);
}

void
zil_flush_vdevs(zilog_t *zilog)
{
        spa_t *spa = zilog->zl_spa;
        avl_tree_t *t = &zilog->zl_vdev_tree;
        void *cookie = NULL;
        zil_vdev_node_t *zv;
        zio_t *zio;

        ASSERT(zilog->zl_writer);

        /*
         * We don't need zl_vdev_lock here because we're the zl_writer,
         * and all zl_get_data() callbacks are done.
         */
        if (avl_numnodes(t) == 0)
                return;

        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

        while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
                vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
                if (vd != NULL)
                        zio_flush(zio, vd);
                kmem_free(zv, sizeof (*zv));
        }

        /*
         * Wait for all the flushes to complete. Not all devices actually
         * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
         */
        (void) zio_wait(zio);

        spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Function called when a log block write completes
 */
static void
zil_lwb_write_done(zio_t *zio)
{
        lwb_t *lwb = zio->io_private;
        zilog_t *zilog = lwb->lwb_zilog;
        dmu_tx_t *tx = lwb->lwb_tx;

        ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
        ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
        ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
        ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
        ASSERT(!BP_IS_GANG(zio->io_bp));
        ASSERT(!BP_IS_HOLE(zio->io_bp));
        ASSERT(zio->io_bp->blk_fill == 0);

        /*
         * Ensure the lwb buffer pointer is cleared before releasing
         * the txg. If we have had an allocation failure and
         * the txg is waiting to sync then we want zil_sync()
         * to remove the lwb so that it's not picked up as the next new
         * one in zil_commit_writer(). zil_sync() will only remove
         * the lwb if lwb_buf is null.
         */
        zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
        mutex_enter(&zilog->zl_lock);
        lwb->lwb_buf = NULL;
        lwb->lwb_tx = NULL;
        mutex_exit(&zilog->zl_lock);

        /*
         * Now that we've written this log block, we have a stable pointer
         * to the next block in the chain, so it's OK to let the txg in
         * which we allocated the next block sync.
         */
        dmu_tx_commit(tx);
}

/*
 * Initialize the io for a log block.
 */
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
        zbookmark_t zb;

        SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
            ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
            lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

        if (zilog->zl_root_zio == NULL) {
                zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
                    ZIO_FLAG_CANFAIL);
        }
        if (lwb->lwb_zio == NULL) {
                lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
                    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
                    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
                    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
        }
}

/*
 * Define a limited set of intent log block sizes.
 * These must be a multiple of 4KB. Note only the amount used (again
 * aligned to 4KB) actually gets written. However, we can't always just
 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
 */
uint64_t zil_block_buckets[] = {
    4096,               /* non TX_WRITE */
    8192+4096,          /* data base */
    32*1024 + 4096,     /* NFS writes */
    UINT64_MAX
};

/*
 * Use the slog as long as the logbias is 'latency' and the current commit size
 * is less than the limit or the total list size is less than 2X the limit.
 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
 */
uint64_t zil_slog_limit = 1024 * 1024;
#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
        (((zilog)->zl_cur_used < zil_slog_limit) || \
        ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
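
/*
 * Worked example of the macro above (illustrative, with the default
 * zil_slog_limit of 1MB and logbias=latency): a commit with 512K
 * currently in flight (zl_cur_used) goes to the slog; only once
 * zl_cur_used reaches 1MB and the queued itx list (zl_itx_list_sz)
 * also reaches 2MB do new log blocks fall back to the main pool.
 */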

/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
        lwb_t *nlwb = NULL;
        zil_chain_t *zilc;
        spa_t *spa = zilog->zl_spa;
        blkptr_t *bp;
        dmu_tx_t *tx;
        uint64_t txg;
        uint64_t zil_blksz, wsz;
        int i, error;

        if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
                zilc = (zil_chain_t *)lwb->lwb_buf;
                bp = &zilc->zc_next_blk;
        } else {
                zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
                bp = &zilc->zc_next_blk;
        }

        ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

        /*
         * Allocate the next block and save its address in this block
         * before writing it in order to establish the log chain.
         * Note that if the allocation of nlwb synced before we wrote
         * the block that points at it (lwb), we'd leak it if we crashed.
         * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
         * We dirty the dataset to ensure that zil_sync() will be called
         * to clean up in the event of allocation failure or I/O failure.
         */
        tx = dmu_tx_create(zilog->zl_os);
        VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
        dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
        txg = dmu_tx_get_txg(tx);

        lwb->lwb_tx = tx;

        /*
         * Log blocks are pre-allocated. Here we select the size of the next
         * block, based on size used in the last block.
         * - first find the smallest bucket that will fit the block from a
         *   limited set of block sizes. This is because it's faster to write
         *   blocks allocated from the same metaslab as they are adjacent or
         *   close.
         * - next find the maximum from the new suggested size and an array of
         *   previous sizes. This lessens a picket fence effect of wrongly
         *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
         *   requests.
         *
         * Note we only write what is used, but we can't just allocate
         * the maximum block size because we can exhaust the available
         * pool log space.
         */
        zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
        for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
                continue;
        zil_blksz = zil_block_buckets[i];
        if (zil_blksz == UINT64_MAX)
                zil_blksz = SPA_MAXBLOCKSIZE;
        zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
        for (i = 0; i < ZIL_PREV_BLKS; i++)
                zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
        zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
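
        /*
         * Worked example (illustrative): a stream of alternating 2K and
         * 64K commits would, on raw bucket choice alone, flip between the
         * 4K bucket and SPA_MAXBLOCKSIZE. Folding in the ZIL_PREV_BLKS
         * history via MAX() above keeps zil_blksz pinned at the larger
         * size until the big commits age out of zl_prev_blks[].
         */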

        BP_ZERO(bp);
        /* pass the old blkptr in order to spread log blocks across devs */
        error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
            USE_SLOG(zilog));
        if (!error) {
                ASSERT3U(bp->blk_birth, ==, txg);
                bp->blk_cksum = lwb->lwb_blk.blk_cksum;
                bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

                /*
                 * Allocate a new log write buffer (lwb).
                 */
                nlwb = zil_alloc_lwb(zilog, bp, txg);

                /* Record the block for later vdev flushing */
                zil_add_block(zilog, &lwb->lwb_blk);
        }

        if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
                /* For Slim ZIL only write what is used. */
                wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
                ASSERT3U(wsz, <=, lwb->lwb_sz);
                zio_shrink(lwb->lwb_zio, wsz);

        } else {
                wsz = lwb->lwb_sz;
        }

        zilc->zc_pad = 0;
        zilc->zc_nused = lwb->lwb_nused;
        zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

        /*
         * clear unused data for security
         */
        bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

        zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

        /*
         * If there was an allocation failure then nlwb will be null which
         * forces a txg_wait_synced().
         */
        return (nlwb);
}
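
/*
 * A note on the write states handled by zil_lwb_commit() below
 * (summarizing the code, not adding to it): WR_COPIED records already
 * carry their data in the itx; WR_NEED_COPY records reserve dlen bytes
 * after the record and have zl_get_data() copy the data in; WR_INDIRECT
 * records pass dbuf == NULL so zl_get_data() fills in lr_blkptr instead.
 */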

static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
        lr_t *lrc = &itx->itx_lr; /* common log record */
        lr_write_t *lrw = (lr_write_t *)lrc;
        char *lr_buf;
        uint64_t txg = lrc->lrc_txg;
        uint64_t reclen = lrc->lrc_reclen;
        uint64_t dlen = 0;

        if (lwb == NULL)
                return (NULL);

        ASSERT(lwb->lwb_buf != NULL);

        if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
                dlen = P2ROUNDUP_TYPED(
                    lrw->lr_length, sizeof (uint64_t), uint64_t);

        zilog->zl_cur_used += (reclen + dlen);

        zil_lwb_write_init(zilog, lwb);

        /*
         * If this record won't fit in the current log block, start a new one.
         */
        if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
                lwb = zil_lwb_write_start(zilog, lwb);
                if (lwb == NULL)
                        return (NULL);
                zil_lwb_write_init(zilog, lwb);
                ASSERT(LWB_EMPTY(lwb));
                if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
                        txg_wait_synced(zilog->zl_dmu_pool, txg);
                        return (lwb);
                }
        }

        lr_buf = lwb->lwb_buf + lwb->lwb_nused;
        bcopy(lrc, lr_buf, reclen);
        lrc = (lr_t *)lr_buf;
        lrw = (lr_write_t *)lrc;

        /*
         * If it's a write, fetch the data or get its blkptr as appropriate.
         */
        if (lrc->lrc_txtype == TX_WRITE) {
                if (txg > spa_freeze_txg(zilog->zl_spa))
                        txg_wait_synced(zilog->zl_dmu_pool, txg);
                if (itx->itx_wr_state != WR_COPIED) {
                        char *dbuf;
                        int error;

                        if (dlen) {
                                ASSERT(itx->itx_wr_state == WR_NEED_COPY);
                                dbuf = lr_buf + reclen;
                                lrw->lr_common.lrc_reclen += dlen;
                        } else {
                                ASSERT(itx->itx_wr_state == WR_INDIRECT);
                                dbuf = NULL;
                        }
                        error = zilog->zl_get_data(
                            itx->itx_private, lrw, dbuf, lwb->lwb_zio);
                        if (error == EIO) {
                                txg_wait_synced(zilog->zl_dmu_pool, txg);
                                return (lwb);
                        }
                        if (error) {
                                ASSERT(error == ENOENT || error == EEXIST ||
                                    error == EALREADY);
                                return (lwb);
                        }
                }
        }

        /*
         * We're actually making an entry, so update lrc_seq to be the
         * log record sequence number. Note that this is generally not
         * equal to the itx sequence number because not all transactions
         * are synchronous, and sometimes spa_sync() gets there first.
         */
        lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
        lwb->lwb_nused += reclen + dlen;
        lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
        ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
        ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

        return (lwb);
}

itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
        itx_t *itx;

        lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

        itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
        itx->itx_lr.lrc_txtype = txtype;
        itx->itx_lr.lrc_reclen = lrsize;
        itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
        itx->itx_lr.lrc_seq = 0; /* defensive */
        itx->itx_sync = B_TRUE; /* default is synchronous */

        return (itx);
}

void
zil_itx_destroy(itx_t *itx)
{
        kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}
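
/*
 * Typical itx lifecycle, sketched from the functions in this file
 * (illustrative; lr_create_t stands in for whatever record type the
 * caller is logging): build a record, then hand it to the ZIL under the
 * caller's dmu tx; assigned itxs are freed later by zil_itxg_clean()
 * once their txg has synced:
 *
 *        itx = zil_itx_create(txtype, sizeof (lr_create_t));
 *        ... fill in itx->itx_lr ...
 *        zil_itx_assign(zilog, itx, tx);
 */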

/*
 * Free up the sync and async itxs. The itxs_t has already been detached
 * so no locks are needed.
 */
static void
zil_itxg_clean(itxs_t *itxs)
{
        itx_t *itx;
        list_t *list;
        avl_tree_t *t;
        void *cookie;
        itx_async_node_t *ian;

        list = &itxs->i_sync_list;
        while ((itx = list_head(list)) != NULL) {
                list_remove(list, itx);
                kmem_free(itx, offsetof(itx_t, itx_lr) +
                    itx->itx_lr.lrc_reclen);
        }

        cookie = NULL;
        t = &itxs->i_async_tree;
        while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
                list = &ian->ia_list;
                while ((itx = list_head(list)) != NULL) {
                        list_remove(list, itx);
                        kmem_free(itx, offsetof(itx_t, itx_lr) +
                            itx->itx_lr.lrc_reclen);
                }
                list_destroy(list);
                kmem_free(ian, sizeof (itx_async_node_t));
        }
        avl_destroy(t);

        kmem_free(itxs, sizeof (itxs_t));
}

static int
zil_aitx_compare(const void *x1, const void *x2)
{
        const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
        const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;

        if (o1 < o2)
                return (-1);
        if (o1 > o2)
                return (1);

        return (0);
}
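
/*
 * Shape of the per-txg itx bookkeeping used below (sketch):
 *
 *        zl_itxg[txg & TXG_MASK].itxg_itxs (itxs_t)
 *            i_sync_list:   itx -> itx -> ...    (synchronous records)
 *            i_async_tree:  AVL keyed by ia_foid (per-object lists)
 *                ian->ia_list:  itx -> itx -> ...
 *
 * Async itxs only reach a log block if zil_async_to_sync() moves them
 * onto a sync list for their object.
 */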

/*
 * Remove all async itx with the given oid.
 */
void
zil_remove_async(zilog_t *zilog, uint64_t oid)
{
        uint64_t otxg, txg;
        itx_async_node_t *ian;
        avl_tree_t *t;
        avl_index_t where;
        list_t clean_list;
        itx_t *itx;

        ASSERT(oid != 0);
        list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

        if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
                otxg = ZILTEST_TXG;
        else
                otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

        for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
                itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

                mutex_enter(&itxg->itxg_lock);
                if (itxg->itxg_txg != txg) {
                        mutex_exit(&itxg->itxg_lock);
                        continue;
                }

                /*
                 * Locate the object node and append its list.
                 */
                t = &itxg->itxg_itxs->i_async_tree;
                ian = avl_find(t, &oid, &where);
                if (ian != NULL)
                        list_move_tail(&clean_list, &ian->ia_list);
                mutex_exit(&itxg->itxg_lock);
        }
        while ((itx = list_head(&clean_list)) != NULL) {
                list_remove(&clean_list, itx);
                kmem_free(itx, offsetof(itx_t, itx_lr) +
                    itx->itx_lr.lrc_reclen);
        }
        list_destroy(&clean_list);
}

void
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
        uint64_t txg;
        itxg_t *itxg;
        itxs_t *itxs, *clean = NULL;

        /*
         * Object ids can be re-instantiated in the same or next txg so
         * remove any async transactions to avoid future leaks.
         * This can happen if a fsync occurs on the re-instantiated
         * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
         * the new file data and flushes a write record for the old object.
         */
        if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
                zil_remove_async(zilog, itx->itx_oid);

        if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
                txg = ZILTEST_TXG;
        else
                txg = dmu_tx_get_txg(tx);

        itxg = &zilog->zl_itxg[txg & TXG_MASK];
        mutex_enter(&itxg->itxg_lock);
        itxs = itxg->itxg_itxs;
        if (itxg->itxg_txg != txg) {
                if (itxs != NULL) {
                        /*
                         * The zil_clean callback hasn't got around to cleaning
                         * this itxg. Save the itxs for release below.
                         * This should be rare.
                         */
                        atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
                        itxg->itxg_sod = 0;
                        clean = itxg->itxg_itxs;
                }
                ASSERT(itxg->itxg_sod == 0);
                itxg->itxg_txg = txg;
                itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);

                list_create(&itxs->i_sync_list, sizeof (itx_t),
                    offsetof(itx_t, itx_node));
                avl_create(&itxs->i_async_tree, zil_aitx_compare,
                    sizeof (itx_async_node_t),
                    offsetof(itx_async_node_t, ia_node));
        }
        if (itx->itx_sync) {
                list_insert_tail(&itxs->i_sync_list, itx);
                atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
                itxg->itxg_sod += itx->itx_sod;
        } else {
                avl_tree_t *t = &itxs->i_async_tree;
                uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
                itx_async_node_t *ian;
                avl_index_t where;

                ian = avl_find(t, &foid, &where);
                if (ian == NULL) {
                        ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
                        list_create(&ian->ia_list, sizeof (itx_t),
                            offsetof(itx_t, itx_node));
                        ian->ia_foid = foid;
                        avl_insert(t, ian, where);
                }
                list_insert_tail(&ian->ia_list, itx);
        }

        itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
        mutex_exit(&itxg->itxg_lock);

        /* Release the old itxs now we've dropped the lock */
        if (clean != NULL)
                zil_itxg_clean(clean);
}

/*
 * If there are any in-memory intent log transactions which have now been
 * synced then start up a taskq to free them.
 */
them. 12442638Sperrin */ 1245789Sahrens void 124612699SNeil.Perrin@Sun.COM zil_clean(zilog_t *zilog, uint64_t synced_txg) 1247789Sahrens { 124812699SNeil.Perrin@Sun.COM itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; 124912699SNeil.Perrin@Sun.COM itxs_t *clean_me; 125012699SNeil.Perrin@Sun.COM 125112699SNeil.Perrin@Sun.COM mutex_enter(&itxg->itxg_lock); 125212699SNeil.Perrin@Sun.COM if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { 125312699SNeil.Perrin@Sun.COM mutex_exit(&itxg->itxg_lock); 125412699SNeil.Perrin@Sun.COM return; 125512699SNeil.Perrin@Sun.COM } 125612699SNeil.Perrin@Sun.COM ASSERT3U(itxg->itxg_txg, <=, synced_txg); 125712699SNeil.Perrin@Sun.COM ASSERT(itxg->itxg_txg != 0); 125812699SNeil.Perrin@Sun.COM ASSERT(zilog->zl_clean_taskq != NULL); 125912699SNeil.Perrin@Sun.COM atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); 126012699SNeil.Perrin@Sun.COM itxg->itxg_sod = 0; 126112699SNeil.Perrin@Sun.COM clean_me = itxg->itxg_itxs; 126212699SNeil.Perrin@Sun.COM itxg->itxg_itxs = NULL; 126312699SNeil.Perrin@Sun.COM itxg->itxg_txg = 0; 126412699SNeil.Perrin@Sun.COM mutex_exit(&itxg->itxg_lock); 126512699SNeil.Perrin@Sun.COM /* 126612699SNeil.Perrin@Sun.COM * Preferably start a task queue to free up the old itxs, but 126712699SNeil.Perrin@Sun.COM * if taskq_dispatch can't allocate resources to do that then 126812699SNeil.Perrin@Sun.COM * free them in-line. This should be rare. Note that using 126912699SNeil.Perrin@Sun.COM * TQ_SLEEP here previously caused a bad performance problem. 127012699SNeil.Perrin@Sun.COM */ 127112699SNeil.Perrin@Sun.COM if (taskq_dispatch(zilog->zl_clean_taskq, 127212699SNeil.Perrin@Sun.COM (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL) 127312699SNeil.Perrin@Sun.COM zil_itxg_clean(clean_me); 127412699SNeil.Perrin@Sun.COM } 127512699SNeil.Perrin@Sun.COM 127612699SNeil.Perrin@Sun.COM /* 127712699SNeil.Perrin@Sun.COM * Get the list of itxs to commit into zl_itx_commit_list.
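 * The list is built by scanning the zl_itxg[] slots for each of the
 * TXG_CONCURRENT_STATES txgs that may still hold unsynced itxs and
 * moving every slot's i_sync_list to the tail of the commit list.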
127812699SNeil.Perrin@Sun.COM */ 127912699SNeil.Perrin@Sun.COM void 128012699SNeil.Perrin@Sun.COM zil_get_commit_list(zilog_t *zilog) 128112699SNeil.Perrin@Sun.COM { 128212699SNeil.Perrin@Sun.COM uint64_t otxg, txg; 128312699SNeil.Perrin@Sun.COM list_t *commit_list = &zilog->zl_itx_commit_list; 128412699SNeil.Perrin@Sun.COM uint64_t push_sod = 0; 128512699SNeil.Perrin@Sun.COM 128612699SNeil.Perrin@Sun.COM if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 128712699SNeil.Perrin@Sun.COM otxg = ZILTEST_TXG; 128812699SNeil.Perrin@Sun.COM else 128912699SNeil.Perrin@Sun.COM otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 129012699SNeil.Perrin@Sun.COM 129112699SNeil.Perrin@Sun.COM for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 129212699SNeil.Perrin@Sun.COM itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 129312699SNeil.Perrin@Sun.COM 129412699SNeil.Perrin@Sun.COM mutex_enter(&itxg->itxg_lock); 129512699SNeil.Perrin@Sun.COM if (itxg->itxg_txg != txg) { 129612699SNeil.Perrin@Sun.COM mutex_exit(&itxg->itxg_lock); 129712699SNeil.Perrin@Sun.COM continue; 129812699SNeil.Perrin@Sun.COM } 12993063Sperrin 130012699SNeil.Perrin@Sun.COM list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); 130112699SNeil.Perrin@Sun.COM push_sod += itxg->itxg_sod; 130212699SNeil.Perrin@Sun.COM itxg->itxg_sod = 0; 130312699SNeil.Perrin@Sun.COM 130412699SNeil.Perrin@Sun.COM mutex_exit(&itxg->itxg_lock); 13053063Sperrin } 130612699SNeil.Perrin@Sun.COM atomic_add_64(&zilog->zl_itx_list_sz, -push_sod); 130712699SNeil.Perrin@Sun.COM } 130812699SNeil.Perrin@Sun.COM 130912699SNeil.Perrin@Sun.COM /* 131012699SNeil.Perrin@Sun.COM * Move the async itxs for a specified object to commit into sync lists. 131112699SNeil.Perrin@Sun.COM */ 131212699SNeil.Perrin@Sun.COM void 131312699SNeil.Perrin@Sun.COM zil_async_to_sync(zilog_t *zilog, uint64_t foid) 131412699SNeil.Perrin@Sun.COM { 131512699SNeil.Perrin@Sun.COM uint64_t otxg, txg; 131612699SNeil.Perrin@Sun.COM itx_async_node_t *ian; 131712699SNeil.Perrin@Sun.COM avl_tree_t *t; 131812699SNeil.Perrin@Sun.COM avl_index_t where; 131912699SNeil.Perrin@Sun.COM 132012699SNeil.Perrin@Sun.COM if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ 132112699SNeil.Perrin@Sun.COM otxg = ZILTEST_TXG; 132212699SNeil.Perrin@Sun.COM else 132312699SNeil.Perrin@Sun.COM otxg = spa_last_synced_txg(zilog->zl_spa) + 1; 132412699SNeil.Perrin@Sun.COM 132512699SNeil.Perrin@Sun.COM for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { 132612699SNeil.Perrin@Sun.COM itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; 132712699SNeil.Perrin@Sun.COM 132812699SNeil.Perrin@Sun.COM mutex_enter(&itxg->itxg_lock); 132912699SNeil.Perrin@Sun.COM if (itxg->itxg_txg != txg) { 133012699SNeil.Perrin@Sun.COM mutex_exit(&itxg->itxg_lock); 133112699SNeil.Perrin@Sun.COM continue; 133212699SNeil.Perrin@Sun.COM } 133312699SNeil.Perrin@Sun.COM 133412699SNeil.Perrin@Sun.COM /* 133512699SNeil.Perrin@Sun.COM * If a foid is specified then find that node and append its 133612699SNeil.Perrin@Sun.COM * list. Otherwise walk the tree appending all the lists 133712699SNeil.Perrin@Sun.COM * to the sync list. We add to the end rather than the 133812699SNeil.Perrin@Sun.COM * beginning to ensure the create has happened. 
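 * zil_commit() calls this first, so an fsync of a single object
 * (foid != 0) picks up just that object's queued async writes, while
 * a commit of everything (foid == 0) drains the entire tree.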
133912699SNeil.Perrin@Sun.COM */ 134012699SNeil.Perrin@Sun.COM t = &itxg->itxg_itxs->i_async_tree; 134112699SNeil.Perrin@Sun.COM if (foid != 0) { 134212699SNeil.Perrin@Sun.COM ian = avl_find(t, &foid, &where); 134312699SNeil.Perrin@Sun.COM if (ian != NULL) { 134412699SNeil.Perrin@Sun.COM list_move_tail(&itxg->itxg_itxs->i_sync_list, 134512699SNeil.Perrin@Sun.COM &ian->ia_list); 134612699SNeil.Perrin@Sun.COM } 134712699SNeil.Perrin@Sun.COM } else { 134812699SNeil.Perrin@Sun.COM void *cookie = NULL; 134912699SNeil.Perrin@Sun.COM 135012699SNeil.Perrin@Sun.COM while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { 135112699SNeil.Perrin@Sun.COM list_move_tail(&itxg->itxg_itxs->i_sync_list, 135212699SNeil.Perrin@Sun.COM &ian->ia_list); 135312699SNeil.Perrin@Sun.COM list_destroy(&ian->ia_list); 135412699SNeil.Perrin@Sun.COM kmem_free(ian, sizeof (itx_async_node_t)); 135512699SNeil.Perrin@Sun.COM } 135612699SNeil.Perrin@Sun.COM } 135712699SNeil.Perrin@Sun.COM mutex_exit(&itxg->itxg_lock); 135812699SNeil.Perrin@Sun.COM } 1359789Sahrens } 1360789Sahrens 13617754SJeff.Bonwick@Sun.COM static void 136212699SNeil.Perrin@Sun.COM zil_commit_writer(zilog_t *zilog) 1363789Sahrens { 1364789Sahrens uint64_t txg; 136512699SNeil.Perrin@Sun.COM itx_t *itx; 1366789Sahrens lwb_t *lwb; 136712699SNeil.Perrin@Sun.COM spa_t *spa = zilog->zl_spa; 136810922SJeff.Bonwick@Sun.COM int error = 0; 1369789Sahrens 13707754SJeff.Bonwick@Sun.COM ASSERT(zilog->zl_root_zio == NULL); 137112699SNeil.Perrin@Sun.COM 137212699SNeil.Perrin@Sun.COM mutex_exit(&zilog->zl_lock); 137312699SNeil.Perrin@Sun.COM 137412699SNeil.Perrin@Sun.COM zil_get_commit_list(zilog); 137512699SNeil.Perrin@Sun.COM 137612699SNeil.Perrin@Sun.COM /* 137712699SNeil.Perrin@Sun.COM * Return if there's nothing to commit before we dirty the fs by 137812699SNeil.Perrin@Sun.COM * calling zil_create(). 137912699SNeil.Perrin@Sun.COM */ 138012699SNeil.Perrin@Sun.COM if (list_head(&zilog->zl_itx_commit_list) == NULL) { 138112699SNeil.Perrin@Sun.COM mutex_enter(&zilog->zl_lock); 138212699SNeil.Perrin@Sun.COM return; 138312699SNeil.Perrin@Sun.COM } 1384789Sahrens 1385789Sahrens if (zilog->zl_suspend) { 1386789Sahrens lwb = NULL; 1387789Sahrens } else { 1388789Sahrens lwb = list_tail(&zilog->zl_lwb_list); 138912699SNeil.Perrin@Sun.COM if (lwb == NULL) 139011670SNeil.Perrin@Sun.COM lwb = zil_create(zilog); 1391789Sahrens } 139210922SJeff.Bonwick@Sun.COM 139312699SNeil.Perrin@Sun.COM DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); 139412699SNeil.Perrin@Sun.COM while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { 1395789Sahrens txg = itx->itx_lr.lrc_txg; 1396789Sahrens ASSERT(txg); 1397789Sahrens 139812699SNeil.Perrin@Sun.COM if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) 1399789Sahrens lwb = zil_lwb_commit(zilog, itx, lwb); 140012699SNeil.Perrin@Sun.COM list_remove(&zilog->zl_itx_commit_list, itx); 140112699SNeil.Perrin@Sun.COM kmem_free(itx, offsetof(itx_t, itx_lr) 140212699SNeil.Perrin@Sun.COM + itx->itx_lr.lrc_reclen); 1403789Sahrens } 14042638Sperrin DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); 1405789Sahrens 1406789Sahrens /* write the last block out */ 14073063Sperrin if (lwb != NULL && lwb->lwb_zio != NULL) 1408789Sahrens lwb = zil_lwb_write_start(zilog, lwb); 1409789Sahrens 14101141Sperrin zilog->zl_cur_used = 0; 14111141Sperrin 14122638Sperrin /* 14132638Sperrin * Wait if necessary for the log blocks to be on stable storage.
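 * zio_wait() on the root zio collects the status of every lwb write
 * issued above; on any error we simply fall back to txg_wait_synced(),
 * which provides the same stability guarantee through the main pool.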
14142638Sperrin */ 14152638Sperrin if (zilog->zl_root_zio) { 141610922SJeff.Bonwick@Sun.COM error = zio_wait(zilog->zl_root_zio); 14177754SJeff.Bonwick@Sun.COM zilog->zl_root_zio = NULL; 14185688Sbonwick zil_flush_vdevs(zilog); 1419789Sahrens } 14201141Sperrin 142110922SJeff.Bonwick@Sun.COM if (error || lwb == NULL) 1422789Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 14233063Sperrin 14243063Sperrin mutex_enter(&zilog->zl_lock); 142510922SJeff.Bonwick@Sun.COM 142610922SJeff.Bonwick@Sun.COM /* 142710922SJeff.Bonwick@Sun.COM * Remember the highest committed log sequence number for ztest. 142810922SJeff.Bonwick@Sun.COM * We only update this value when all the log writes succeeded, 142910922SJeff.Bonwick@Sun.COM * because ztest wants to ASSERT that it got the whole log chain. 143010922SJeff.Bonwick@Sun.COM */ 143110922SJeff.Bonwick@Sun.COM if (error == 0 && lwb != NULL) 143210922SJeff.Bonwick@Sun.COM zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 14332638Sperrin } 14342638Sperrin 14352638Sperrin /* 143612699SNeil.Perrin@Sun.COM * Commit zfs transactions to stable storage. 14372638Sperrin * If foid is 0 push out all transactions, otherwise push only those 143812699SNeil.Perrin@Sun.COM * for that object or that might reference that object. 143912699SNeil.Perrin@Sun.COM * 144012699SNeil.Perrin@Sun.COM * itxs are committed in batches. In a heavily stressed zil there will be 144112699SNeil.Perrin@Sun.COM * a commit writer thread that is writing out a bunch of itxs to the log 144212699SNeil.Perrin@Sun.COM * for a set of committing threads (cthreads) in the same batch as the writer. 144312699SNeil.Perrin@Sun.COM * Those cthreads are all waiting on the same cv for that batch. 144412699SNeil.Perrin@Sun.COM * 144512699SNeil.Perrin@Sun.COM * There will also be a different and growing batch of threads that are 144612699SNeil.Perrin@Sun.COM * waiting to commit (qthreads). When the committing batch completes 144712699SNeil.Perrin@Sun.COM * a transition occurs such that the cthreads exit and the qthreads become 144812699SNeil.Perrin@Sun.COM * cthreads. One of the new cthreads becomes the writer thread for the 144912699SNeil.Perrin@Sun.COM * batch. Any new threads arriving become new qthreads. 145012699SNeil.Perrin@Sun.COM * 145112699SNeil.Perrin@Sun.COM * Only two condition variables are needed, and no hand-off between 145212699SNeil.Perrin@Sun.COM * them is required: they simply flip-flop between serving qthreads 145312699SNeil.Perrin@Sun.COM * and cthreads. 145412699SNeil.Perrin@Sun.COM * 145512699SNeil.Perrin@Sun.COM * Using this scheme we can efficiently wake up only those threads 145612699SNeil.Perrin@Sun.COM * whose transactions have been committed.
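 *
 * An illustrative interleaving (hypothetical threads T1 and T2,
 * batch numbers B and B+1):
 *
 *	T1: zil_commit()  - becomes writer for batch B; zl_next_batch = B+1
 *	T2: zil_commit()  - sees zl_writer set, waits on zl_cv_batch[(B+1) & 1]
 *	T1: batch B done  - zl_com_batch = B, zl_writer = B_FALSE
 *	    cv_signal(zl_cv_batch[(B+1) & 1])  - one qthread (T2) wakes to
 *	        become the writer for batch B+1
 *	    cv_broadcast(zl_cv_batch[B & 1])   - all batch B cthreads return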
14572638Sperrin */ 14582638Sperrin void 145912699SNeil.Perrin@Sun.COM zil_commit(zilog_t *zilog, uint64_t foid) 14602638Sperrin { 146112699SNeil.Perrin@Sun.COM uint64_t mybatch; 146212699SNeil.Perrin@Sun.COM 146312699SNeil.Perrin@Sun.COM if (zilog->zl_sync == ZFS_SYNC_DISABLED) 14642638Sperrin return; 14652638Sperrin 146612699SNeil.Perrin@Sun.COM /* move the async itxs for the foid to the sync queues */ 146712699SNeil.Perrin@Sun.COM zil_async_to_sync(zilog, foid); 146812699SNeil.Perrin@Sun.COM 14692638Sperrin mutex_enter(&zilog->zl_lock); 147012699SNeil.Perrin@Sun.COM mybatch = zilog->zl_next_batch; 14713063Sperrin while (zilog->zl_writer) { 147212699SNeil.Perrin@Sun.COM cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); 147312699SNeil.Perrin@Sun.COM if (mybatch <= zilog->zl_com_batch) { 14743063Sperrin mutex_exit(&zilog->zl_lock); 14753063Sperrin return; 14763063Sperrin } 14773063Sperrin } 147810922SJeff.Bonwick@Sun.COM 147912699SNeil.Perrin@Sun.COM zilog->zl_next_batch++; 148012699SNeil.Perrin@Sun.COM zilog->zl_writer = B_TRUE; 148112699SNeil.Perrin@Sun.COM zil_commit_writer(zilog); 148212699SNeil.Perrin@Sun.COM zilog->zl_com_batch = mybatch; 148312699SNeil.Perrin@Sun.COM zilog->zl_writer = B_FALSE; 148412699SNeil.Perrin@Sun.COM mutex_exit(&zilog->zl_lock); 148510922SJeff.Bonwick@Sun.COM 148612699SNeil.Perrin@Sun.COM /* wake up one thread to become the next writer */ 148712699SNeil.Perrin@Sun.COM cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); 148810922SJeff.Bonwick@Sun.COM 148912699SNeil.Perrin@Sun.COM /* wake up all threads waiting for this batch to be committed */ 149012699SNeil.Perrin@Sun.COM cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); 149110922SJeff.Bonwick@Sun.COM } 149210922SJeff.Bonwick@Sun.COM 149310922SJeff.Bonwick@Sun.COM /* 1494789Sahrens * Called in syncing context to free committed log blocks and update log header. 1495789Sahrens */ 1496789Sahrens void 1497789Sahrens zil_sync(zilog_t *zilog, dmu_tx_t *tx) 1498789Sahrens { 14991807Sbonwick zil_header_t *zh = zil_header_in_syncing_context(zilog); 1500789Sahrens uint64_t txg = dmu_tx_get_txg(tx); 1501789Sahrens spa_t *spa = zilog->zl_spa; 150210922SJeff.Bonwick@Sun.COM uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; 1503789Sahrens lwb_t *lwb; 1504789Sahrens 15059396SMatthew.Ahrens@Sun.COM /* 15069396SMatthew.Ahrens@Sun.COM * We don't zero out zl_destroy_txg, so make sure we don't try 15079396SMatthew.Ahrens@Sun.COM * to destroy it twice. 
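 * (spa_sync() can make several passes over the same txg; only the
 * first pass should touch the ZIL header, hence the early return
 * below.)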
15089396SMatthew.Ahrens@Sun.COM */ 15099396SMatthew.Ahrens@Sun.COM if (spa_sync_pass(spa) != 1) 15109396SMatthew.Ahrens@Sun.COM return; 15119396SMatthew.Ahrens@Sun.COM 15121807Sbonwick mutex_enter(&zilog->zl_lock); 15131807Sbonwick 1514789Sahrens ASSERT(zilog->zl_stop_sync == 0); 1515789Sahrens 151610922SJeff.Bonwick@Sun.COM if (*replayed_seq != 0) { 151710922SJeff.Bonwick@Sun.COM ASSERT(zh->zh_replay_seq < *replayed_seq); 151810922SJeff.Bonwick@Sun.COM zh->zh_replay_seq = *replayed_seq; 151910922SJeff.Bonwick@Sun.COM *replayed_seq = 0; 152010922SJeff.Bonwick@Sun.COM } 1521789Sahrens 1522789Sahrens if (zilog->zl_destroy_txg == txg) { 15231807Sbonwick blkptr_t blk = zh->zh_log; 15241807Sbonwick 15251807Sbonwick ASSERT(list_head(&zilog->zl_lwb_list) == NULL); 15261807Sbonwick 15271807Sbonwick bzero(zh, sizeof (zil_header_t)); 15288227SNeil.Perrin@Sun.COM bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); 15291807Sbonwick 15301807Sbonwick if (zilog->zl_keep_first) { 15311807Sbonwick /* 15321807Sbonwick * If this block was part of log chain that couldn't 15331807Sbonwick * be claimed because a device was missing during 15341807Sbonwick * zil_claim(), but that device later returns, 15351807Sbonwick * then this block could erroneously appear valid. 15361807Sbonwick * To guard against this, assign a new GUID to the new 15371807Sbonwick * log chain so it doesn't matter what blk points to. 15381807Sbonwick */ 15391807Sbonwick zil_init_log_chain(zilog, &blk); 15401807Sbonwick zh->zh_log = blk; 15411807Sbonwick } 1542789Sahrens } 1543789Sahrens 15449701SGeorge.Wilson@Sun.COM while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 15452638Sperrin zh->zh_log = lwb->lwb_blk; 1546789Sahrens if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 1547789Sahrens break; 1548789Sahrens list_remove(&zilog->zl_lwb_list, lwb); 154910922SJeff.Bonwick@Sun.COM zio_free_zil(spa, txg, &lwb->lwb_blk); 1550789Sahrens kmem_cache_free(zil_lwb_cache, lwb); 15513668Sgw25295 15523668Sgw25295 /* 15533668Sgw25295 * If we don't have anything left in the lwb list then 15543668Sgw25295 * we've had an allocation failure and we need to zero 15553668Sgw25295 * out the zil_header blkptr so that we don't end 15563668Sgw25295 * up freeing the same block twice. 
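 * (With zh_log zeroed and the lwb list empty, the next commit simply
 * starts a fresh chain via zil_create(); see zil_commit_writer().)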
15573668Sgw25295 */ 15583668Sgw25295 if (list_head(&zilog->zl_lwb_list) == NULL) 15593668Sgw25295 BP_ZERO(&zh->zh_log); 1560789Sahrens } 1561789Sahrens mutex_exit(&zilog->zl_lock); 1562789Sahrens } 1563789Sahrens 1564789Sahrens void 1565789Sahrens zil_init(void) 1566789Sahrens { 1567789Sahrens zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 15682856Snd150628 sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); 1569789Sahrens } 1570789Sahrens 1571789Sahrens void 1572789Sahrens zil_fini(void) 1573789Sahrens { 1574789Sahrens kmem_cache_destroy(zil_lwb_cache); 1575789Sahrens } 1576789Sahrens 157710310SNeil.Perrin@Sun.COM void 157812294SMark.Musante@Sun.COM zil_set_sync(zilog_t *zilog, uint64_t sync) 157912294SMark.Musante@Sun.COM { 158012294SMark.Musante@Sun.COM zilog->zl_sync = sync; 158112294SMark.Musante@Sun.COM } 158212294SMark.Musante@Sun.COM 158312294SMark.Musante@Sun.COM void 158410310SNeil.Perrin@Sun.COM zil_set_logbias(zilog_t *zilog, uint64_t logbias) 158510310SNeil.Perrin@Sun.COM { 158610310SNeil.Perrin@Sun.COM zilog->zl_logbias = logbias; 158710310SNeil.Perrin@Sun.COM } 158810310SNeil.Perrin@Sun.COM 1589789Sahrens zilog_t * 1590789Sahrens zil_alloc(objset_t *os, zil_header_t *zh_phys) 1591789Sahrens { 1592789Sahrens zilog_t *zilog; 1593789Sahrens 1594789Sahrens zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 1595789Sahrens 1596789Sahrens zilog->zl_header = zh_phys; 1597789Sahrens zilog->zl_os = os; 1598789Sahrens zilog->zl_spa = dmu_objset_spa(os); 1599789Sahrens zilog->zl_dmu_pool = dmu_objset_pool(os); 16001807Sbonwick zilog->zl_destroy_txg = TXG_INITIAL - 1; 160110310SNeil.Perrin@Sun.COM zilog->zl_logbias = dmu_objset_logbias(os); 160212294SMark.Musante@Sun.COM zilog->zl_sync = dmu_objset_syncprop(os); 160312699SNeil.Perrin@Sun.COM zilog->zl_next_batch = 1; 1604789Sahrens 16052856Snd150628 mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 16062856Snd150628 160712699SNeil.Perrin@Sun.COM for (int i = 0; i < TXG_SIZE; i++) { 160812699SNeil.Perrin@Sun.COM mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, 160912699SNeil.Perrin@Sun.COM MUTEX_DEFAULT, NULL); 161012699SNeil.Perrin@Sun.COM } 1611789Sahrens 1612789Sahrens list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 1613789Sahrens offsetof(lwb_t, lwb_node)); 1614789Sahrens 161512699SNeil.Perrin@Sun.COM list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), 161612699SNeil.Perrin@Sun.COM offsetof(itx_t, itx_node)); 161712699SNeil.Perrin@Sun.COM 16185688Sbonwick mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 16195688Sbonwick 16205688Sbonwick avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, 16215688Sbonwick sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 1622789Sahrens 16235913Sperrin cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); 16245913Sperrin cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 162512699SNeil.Perrin@Sun.COM cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); 162612699SNeil.Perrin@Sun.COM cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); 16275913Sperrin 1628789Sahrens return (zilog); 1629789Sahrens } 1630789Sahrens 1631789Sahrens void 1632789Sahrens zil_free(zilog_t *zilog) 1633789Sahrens { 163412699SNeil.Perrin@Sun.COM lwb_t *head_lwb; 1635789Sahrens 1636789Sahrens zilog->zl_stop_sync = 1; 1637789Sahrens 163812699SNeil.Perrin@Sun.COM /* 163912699SNeil.Perrin@Sun.COM * After zil_close() there should only be one lwb with a buffer. 
164012699SNeil.Perrin@Sun.COM */ 164112699SNeil.Perrin@Sun.COM head_lwb = list_head(&zilog->zl_lwb_list); 164212699SNeil.Perrin@Sun.COM if (head_lwb) { 164312699SNeil.Perrin@Sun.COM ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list)); 164412699SNeil.Perrin@Sun.COM list_remove(&zilog->zl_lwb_list, head_lwb); 164512699SNeil.Perrin@Sun.COM kmem_cache_free(zil_lwb_cache, head_lwb); 1646789Sahrens } 1647789Sahrens list_destroy(&zilog->zl_lwb_list); 1648789Sahrens 16495688Sbonwick avl_destroy(&zilog->zl_vdev_tree); 16505688Sbonwick mutex_destroy(&zilog->zl_vdev_lock); 1651789Sahrens 165212699SNeil.Perrin@Sun.COM ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); 165312699SNeil.Perrin@Sun.COM list_destroy(&zilog->zl_itx_commit_list); 165412699SNeil.Perrin@Sun.COM 165512699SNeil.Perrin@Sun.COM for (int i = 0; i < TXG_SIZE; i++) { 165612699SNeil.Perrin@Sun.COM /* 165712699SNeil.Perrin@Sun.COM * It's possible for an itx to be generated that doesn't dirty 165812699SNeil.Perrin@Sun.COM * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() 165912699SNeil.Perrin@Sun.COM * callback to remove the entry. We remove those here. 166012699SNeil.Perrin@Sun.COM * 166112699SNeil.Perrin@Sun.COM * Also free up the ziltest itxs. 166212699SNeil.Perrin@Sun.COM */ 166312699SNeil.Perrin@Sun.COM if (zilog->zl_itxg[i].itxg_itxs) 166412699SNeil.Perrin@Sun.COM zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); 166512699SNeil.Perrin@Sun.COM mutex_destroy(&zilog->zl_itxg[i].itxg_lock); 166612699SNeil.Perrin@Sun.COM } 166712699SNeil.Perrin@Sun.COM 16682856Snd150628 mutex_destroy(&zilog->zl_lock); 1669789Sahrens 16705913Sperrin cv_destroy(&zilog->zl_cv_writer); 16715913Sperrin cv_destroy(&zilog->zl_cv_suspend); 167212699SNeil.Perrin@Sun.COM cv_destroy(&zilog->zl_cv_batch[0]); 167312699SNeil.Perrin@Sun.COM cv_destroy(&zilog->zl_cv_batch[1]); 16745913Sperrin 1675789Sahrens kmem_free(zilog, sizeof (zilog_t)); 1676789Sahrens } 1677789Sahrens 1678789Sahrens /* 1679789Sahrens * Open an intent log. 1680789Sahrens */ 1681789Sahrens zilog_t * 1682789Sahrens zil_open(objset_t *os, zil_get_data_t *get_data) 1683789Sahrens { 1684789Sahrens zilog_t *zilog = dmu_objset_zil(os); 1685789Sahrens 1686789Sahrens zilog->zl_get_data = get_data; 1687789Sahrens zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 1688789Sahrens 2, 2, TASKQ_PREPOPULATE); 1689789Sahrens 1690789Sahrens return (zilog); 1691789Sahrens } 1692789Sahrens 1693789Sahrens /* 1694789Sahrens * Close an intent log. 1695789Sahrens */ 1696789Sahrens void 1697789Sahrens zil_close(zilog_t *zilog) 1698789Sahrens { 169912699SNeil.Perrin@Sun.COM lwb_t *tail_lwb; 170012699SNeil.Perrin@Sun.COM uint64_t txg = 0; 170112699SNeil.Perrin@Sun.COM 170212699SNeil.Perrin@Sun.COM zil_commit(zilog, 0); /* commit all itx */ 170312699SNeil.Perrin@Sun.COM 17041807Sbonwick /* 170512699SNeil.Perrin@Sun.COM * The lwb_max_txg for the stubby lwb will reflect the last activity 170612699SNeil.Perrin@Sun.COM * for the zil. After a txg_wait_synced() on the txg we know all the 170712699SNeil.Perrin@Sun.COM * callbacks have occurred that may clean the zil. Only then can we 170812699SNeil.Perrin@Sun.COM * destroy the zl_clean_taskq. 
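 * (The zil_commit() call above has already pushed out all remaining
 * itxs, so no further log activity is expected here.)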
17091807Sbonwick */ 171012699SNeil.Perrin@Sun.COM mutex_enter(&zilog->zl_lock); 171112699SNeil.Perrin@Sun.COM tail_lwb = list_tail(&zilog->zl_lwb_list); 171212699SNeil.Perrin@Sun.COM if (tail_lwb != NULL) 171312699SNeil.Perrin@Sun.COM txg = tail_lwb->lwb_max_txg; 171412699SNeil.Perrin@Sun.COM mutex_exit(&zilog->zl_lock); 171512699SNeil.Perrin@Sun.COM if (txg) 17161807Sbonwick txg_wait_synced(zilog->zl_dmu_pool, txg); 17171807Sbonwick 1718789Sahrens taskq_destroy(zilog->zl_clean_taskq); 1719789Sahrens zilog->zl_clean_taskq = NULL; 1720789Sahrens zilog->zl_get_data = NULL; 1721789Sahrens } 1722789Sahrens 1723789Sahrens /* 1724789Sahrens * Suspend an intent log. While in suspended mode, we still honor 1725789Sahrens * synchronous semantics, but we rely on txg_wait_synced() to do it. 1726789Sahrens * We suspend the log briefly when taking a snapshot so that the snapshot 1727789Sahrens * contains all the data it's supposed to, and has an empty intent log. 1728789Sahrens */ 1729789Sahrens int 1730789Sahrens zil_suspend(zilog_t *zilog) 1731789Sahrens { 17321807Sbonwick const zil_header_t *zh = zilog->zl_header; 1733789Sahrens 1734789Sahrens mutex_enter(&zilog->zl_lock); 17358989SNeil.Perrin@Sun.COM if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 1736789Sahrens mutex_exit(&zilog->zl_lock); 1737789Sahrens return (EBUSY); 1738789Sahrens } 17391807Sbonwick if (zilog->zl_suspend++ != 0) { 17401807Sbonwick /* 17411807Sbonwick * Someone else already began a suspend. 17421807Sbonwick * Just wait for them to finish. 17431807Sbonwick */ 17441807Sbonwick while (zilog->zl_suspending) 17451807Sbonwick cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 17461807Sbonwick mutex_exit(&zilog->zl_lock); 17471807Sbonwick return (0); 17481807Sbonwick } 17491807Sbonwick zilog->zl_suspending = B_TRUE; 1750789Sahrens mutex_exit(&zilog->zl_lock); 1751789Sahrens 175212699SNeil.Perrin@Sun.COM zil_commit(zilog, 0); 1753789Sahrens 17541807Sbonwick zil_destroy(zilog, B_FALSE); 17551807Sbonwick 17561807Sbonwick mutex_enter(&zilog->zl_lock); 17571807Sbonwick zilog->zl_suspending = B_FALSE; 17581807Sbonwick cv_broadcast(&zilog->zl_cv_suspend); 17591807Sbonwick mutex_exit(&zilog->zl_lock); 1760789Sahrens 1761789Sahrens return (0); 1762789Sahrens } 1763789Sahrens 1764789Sahrens void 1765789Sahrens zil_resume(zilog_t *zilog) 1766789Sahrens { 1767789Sahrens mutex_enter(&zilog->zl_lock); 1768789Sahrens ASSERT(zilog->zl_suspend != 0); 1769789Sahrens zilog->zl_suspend--; 1770789Sahrens mutex_exit(&zilog->zl_lock); 1771789Sahrens } 1772789Sahrens 1773789Sahrens typedef struct zil_replay_arg { 1774789Sahrens zil_replay_func_t **zr_replay; 1775789Sahrens void *zr_arg; 1776789Sahrens boolean_t zr_byteswap; 177710922SJeff.Bonwick@Sun.COM char *zr_lr; 1778789Sahrens } zil_replay_arg_t; 1779789Sahrens 178010922SJeff.Bonwick@Sun.COM static int 178110922SJeff.Bonwick@Sun.COM zil_replay_error(zilog_t *zilog, lr_t *lr, int error) 178210922SJeff.Bonwick@Sun.COM { 178310922SJeff.Bonwick@Sun.COM char name[MAXNAMELEN]; 178410922SJeff.Bonwick@Sun.COM 178510922SJeff.Bonwick@Sun.COM zilog->zl_replaying_seq--; /* didn't actually replay this one */ 178610922SJeff.Bonwick@Sun.COM 178710922SJeff.Bonwick@Sun.COM dmu_objset_name(zilog->zl_os, name); 178810922SJeff.Bonwick@Sun.COM 178910922SJeff.Bonwick@Sun.COM cmn_err(CE_WARN, "ZFS replay transaction error %d, " 179010922SJeff.Bonwick@Sun.COM "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 179110922SJeff.Bonwick@Sun.COM (u_longlong_t)lr->lrc_seq, 179210922SJeff.Bonwick@Sun.COM 
(u_longlong_t)(lr->lrc_txtype & ~TX_CI), 179310922SJeff.Bonwick@Sun.COM (lr->lrc_txtype & TX_CI) ? "CI" : ""); 179410922SJeff.Bonwick@Sun.COM 179510922SJeff.Bonwick@Sun.COM return (error); 179610922SJeff.Bonwick@Sun.COM } 179710922SJeff.Bonwick@Sun.COM 179810922SJeff.Bonwick@Sun.COM static int 1799789Sahrens zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 1800789Sahrens { 1801789Sahrens zil_replay_arg_t *zr = zra; 18021807Sbonwick const zil_header_t *zh = zilog->zl_header; 1803789Sahrens uint64_t reclen = lr->lrc_reclen; 1804789Sahrens uint64_t txtype = lr->lrc_txtype; 180510922SJeff.Bonwick@Sun.COM int error = 0; 1806789Sahrens 180710922SJeff.Bonwick@Sun.COM zilog->zl_replaying_seq = lr->lrc_seq; 180810922SJeff.Bonwick@Sun.COM 180910922SJeff.Bonwick@Sun.COM if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 181010922SJeff.Bonwick@Sun.COM return (0); 1811789Sahrens 1812789Sahrens if (lr->lrc_txg < claim_txg) /* already committed */ 181310922SJeff.Bonwick@Sun.COM return (0); 1814789Sahrens 18155331Samw /* Strip case-insensitive bit, still present in log record */ 18165331Samw txtype &= ~TX_CI; 18175331Samw 181810922SJeff.Bonwick@Sun.COM if (txtype == 0 || txtype >= TX_MAX_TYPE) 181910922SJeff.Bonwick@Sun.COM return (zil_replay_error(zilog, lr, EINVAL)); 182010922SJeff.Bonwick@Sun.COM 182110922SJeff.Bonwick@Sun.COM /* 182210922SJeff.Bonwick@Sun.COM * If this record type can be logged out of order, the object 182310922SJeff.Bonwick@Sun.COM * (lr_foid) may no longer exist. That's legitimate, not an error. 182410922SJeff.Bonwick@Sun.COM */ 182510922SJeff.Bonwick@Sun.COM if (TX_OOO(txtype)) { 182610922SJeff.Bonwick@Sun.COM error = dmu_object_info(zilog->zl_os, 182710922SJeff.Bonwick@Sun.COM ((lr_ooo_t *)lr)->lr_foid, NULL); 182810922SJeff.Bonwick@Sun.COM if (error == ENOENT || error == EEXIST) 182910922SJeff.Bonwick@Sun.COM return (0); 18308227SNeil.Perrin@Sun.COM } 18318227SNeil.Perrin@Sun.COM 1832789Sahrens /* 1833789Sahrens * Make a copy of the data so we can revise and extend it. 1834789Sahrens */ 183510922SJeff.Bonwick@Sun.COM bcopy(lr, zr->zr_lr, reclen); 183610922SJeff.Bonwick@Sun.COM 183710922SJeff.Bonwick@Sun.COM /* 183810922SJeff.Bonwick@Sun.COM * If this is a TX_WRITE with a blkptr, suck in the data. 183910922SJeff.Bonwick@Sun.COM */ 184010922SJeff.Bonwick@Sun.COM if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 184110922SJeff.Bonwick@Sun.COM error = zil_read_log_data(zilog, (lr_write_t *)lr, 184210922SJeff.Bonwick@Sun.COM zr->zr_lr + reclen); 184310922SJeff.Bonwick@Sun.COM if (error) 184410922SJeff.Bonwick@Sun.COM return (zil_replay_error(zilog, lr, error)); 184510922SJeff.Bonwick@Sun.COM } 1846789Sahrens 1847789Sahrens /* 1848789Sahrens * The log block containing this lr may have been byteswapped 1849789Sahrens * so that we can easily examine common fields like lrc_txtype. 185010922SJeff.Bonwick@Sun.COM * However, the log is a mix of different record types, and only the 1851789Sahrens * replay vectors know how to byteswap their records. Therefore, if 1852789Sahrens * the lr was byteswapped, undo it before invoking the replay vector. 1853789Sahrens */ 1854789Sahrens if (zr->zr_byteswap) 185510922SJeff.Bonwick@Sun.COM byteswap_uint64_array(zr->zr_lr, reclen); 1856789Sahrens 1857789Sahrens /* 1858789Sahrens * We must now do two things atomically: replay this log record, 18598227SNeil.Perrin@Sun.COM * and update the log header sequence number to reflect the fact that 18608227SNeil.Perrin@Sun.COM * we did so. 
At the end of each replay function the sequence number 18618227SNeil.Perrin@Sun.COM * is updated if we are in replay mode. 1862789Sahrens */ 186310922SJeff.Bonwick@Sun.COM error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 186410922SJeff.Bonwick@Sun.COM if (error) { 18653063Sperrin /* 18663063Sperrin * The DMU's dnode layer doesn't see removes until the txg 18673063Sperrin * commits, so a subsequent claim can spuriously fail with 18688227SNeil.Perrin@Sun.COM * EEXIST. So if we receive any error we try syncing out 186910922SJeff.Bonwick@Sun.COM * any removes then retry the transaction. Note that we 187010922SJeff.Bonwick@Sun.COM * specify B_FALSE for byteswap now, so we don't do it twice. 18713063Sperrin */ 187210922SJeff.Bonwick@Sun.COM txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 187310922SJeff.Bonwick@Sun.COM error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 187410922SJeff.Bonwick@Sun.COM if (error) 187510922SJeff.Bonwick@Sun.COM return (zil_replay_error(zilog, lr, error)); 1876789Sahrens } 187710922SJeff.Bonwick@Sun.COM return (0); 18783063Sperrin } 1879789Sahrens 18803063Sperrin /* ARGSUSED */ 188110922SJeff.Bonwick@Sun.COM static int 18823063Sperrin zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 18833063Sperrin { 18843063Sperrin zilog->zl_replay_blks++; 188510922SJeff.Bonwick@Sun.COM 188610922SJeff.Bonwick@Sun.COM return (0); 1887789Sahrens } 1888789Sahrens 1889789Sahrens /* 18901362Sperrin * If this dataset has a non-empty intent log, replay it and destroy it. 1891789Sahrens */ 1892789Sahrens void 18938227SNeil.Perrin@Sun.COM zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 1894789Sahrens { 1895789Sahrens zilog_t *zilog = dmu_objset_zil(os); 18961807Sbonwick const zil_header_t *zh = zilog->zl_header; 18971807Sbonwick zil_replay_arg_t zr; 18981362Sperrin 18998989SNeil.Perrin@Sun.COM if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 19001807Sbonwick zil_destroy(zilog, B_TRUE); 19011362Sperrin return; 19021362Sperrin } 1903789Sahrens 1904789Sahrens zr.zr_replay = replay_func; 1905789Sahrens zr.zr_arg = arg; 19061807Sbonwick zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 190710922SJeff.Bonwick@Sun.COM zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 1908789Sahrens 1909789Sahrens /* 1910789Sahrens * Wait for in-progress removes to sync before starting replay. 
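 * (As noted in zil_replay_log_record() above, the dnode layer doesn't
 * see removes until their txg commits, so replaying too early could
 * make a replayed create collide with an object whose remove has not
 * yet synced.)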
1911789Sahrens */ 1912789Sahrens txg_wait_synced(zilog->zl_dmu_pool, 0); 1913789Sahrens 19148227SNeil.Perrin@Sun.COM zilog->zl_replay = B_TRUE; 191511066Srafael.vanoni@sun.com zilog->zl_replay_time = ddi_get_lbolt(); 19163063Sperrin ASSERT(zilog->zl_replay_blks == 0); 19173063Sperrin (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 19181807Sbonwick zh->zh_claim_txg); 191910922SJeff.Bonwick@Sun.COM kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 1920789Sahrens 19211807Sbonwick zil_destroy(zilog, B_FALSE); 19225712Sahrens txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 19238227SNeil.Perrin@Sun.COM zilog->zl_replay = B_FALSE; 1924789Sahrens } 19251646Sperrin 192610922SJeff.Bonwick@Sun.COM boolean_t 192710922SJeff.Bonwick@Sun.COM zil_replaying(zilog_t *zilog, dmu_tx_t *tx) 19281646Sperrin { 192912294SMark.Musante@Sun.COM if (zilog->zl_sync == ZFS_SYNC_DISABLED) 193010922SJeff.Bonwick@Sun.COM return (B_TRUE); 19311646Sperrin 193210922SJeff.Bonwick@Sun.COM if (zilog->zl_replay) { 193310922SJeff.Bonwick@Sun.COM dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 193410922SJeff.Bonwick@Sun.COM zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 193510922SJeff.Bonwick@Sun.COM zilog->zl_replaying_seq; 193610922SJeff.Bonwick@Sun.COM return (B_TRUE); 19372638Sperrin } 19382638Sperrin 193910922SJeff.Bonwick@Sun.COM return (B_FALSE); 19401646Sperrin } 19419701SGeorge.Wilson@Sun.COM 19429701SGeorge.Wilson@Sun.COM /* ARGSUSED */ 19439701SGeorge.Wilson@Sun.COM int 194411209SMatthew.Ahrens@Sun.COM zil_vdev_offline(const char *osname, void *arg) 19459701SGeorge.Wilson@Sun.COM { 19469701SGeorge.Wilson@Sun.COM objset_t *os; 19479701SGeorge.Wilson@Sun.COM zilog_t *zilog; 19489701SGeorge.Wilson@Sun.COM int error; 19499701SGeorge.Wilson@Sun.COM 195010298SMatthew.Ahrens@Sun.COM error = dmu_objset_hold(osname, FTAG, &os); 19519701SGeorge.Wilson@Sun.COM if (error) 19529701SGeorge.Wilson@Sun.COM return (error); 19539701SGeorge.Wilson@Sun.COM 19549701SGeorge.Wilson@Sun.COM zilog = dmu_objset_zil(os); 19559701SGeorge.Wilson@Sun.COM if (zil_suspend(zilog) != 0) 19569701SGeorge.Wilson@Sun.COM error = EEXIST; 19579701SGeorge.Wilson@Sun.COM else 19589701SGeorge.Wilson@Sun.COM zil_resume(zilog); 195910298SMatthew.Ahrens@Sun.COM dmu_objset_rele(os, FTAG); 19609701SGeorge.Wilson@Sun.COM return (error); 19619701SGeorge.Wilson@Sun.COM } 1962
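
/*
 * Typical consumer lifecycle, as an illustrative sketch only (the
 * get_data callback name is a placeholder for whatever zil_get_data_t
 * routine the consumer registers):
 *
 *	zilog = zil_open(os, get_data);
 *	...
 *	zil_itx_assign(zilog, itx, tx);		log an intent
 *	zil_commit(zilog, foid);		fsync()/O_DSYNC path
 *	...
 *	zil_close(zilog);
 */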