/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
24789Sahrens */ 25789Sahrens 26789Sahrens #include <sys/zfs_context.h> 271544Seschrock #include <sys/fm/fs/zfs.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/txg.h> 30789Sahrens #include <sys/spa_impl.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio_impl.h> 33789Sahrens #include <sys/zio_compress.h> 34789Sahrens #include <sys/zio_checksum.h> 3510922SJeff.Bonwick@Sun.COM #include <sys/dmu_objset.h> 3610922SJeff.Bonwick@Sun.COM #include <sys/arc.h> 3710922SJeff.Bonwick@Sun.COM #include <sys/ddt.h> 38789Sahrens 39789Sahrens /* 40789Sahrens * ========================================================================== 41789Sahrens * I/O priority table 42789Sahrens * ========================================================================== 43789Sahrens */ 44789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 45789Sahrens 0, /* ZIO_PRIORITY_NOW */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 47789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 48*11146SGeorge.Wilson@Sun.COM 0, /* ZIO_PRIORITY_LOG_WRITE */ 49*11146SGeorge.Wilson@Sun.COM 1, /* ZIO_PRIORITY_CACHE_FILL */ 50*11146SGeorge.Wilson@Sun.COM 1, /* ZIO_PRIORITY_AGG */ 51789Sahrens 4, /* ZIO_PRIORITY_FREE */ 52*11146SGeorge.Wilson@Sun.COM 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 53*11146SGeorge.Wilson@Sun.COM 6, /* ZIO_PRIORITY_ASYNC_READ */ 54789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 55789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 56789Sahrens }; 57789Sahrens 58789Sahrens /* 59789Sahrens * ========================================================================== 60789Sahrens * I/O type descriptions 61789Sahrens * ========================================================================== 62789Sahrens */ 63789Sahrens char *zio_type_name[ZIO_TYPES] = { 64*11146SGeorge.Wilson@Sun.COM "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 65*11146SGeorge.Wilson@Sun.COM "zio_ioctl" 66*11146SGeorge.Wilson@Sun.COM }; 67789Sahrens 68789Sahrens /* 69789Sahrens * 
========================================================================== 70789Sahrens * I/O kmem caches 71789Sahrens * ========================================================================== 72789Sahrens */ 734055Seschrock kmem_cache_t *zio_cache; 748632SBill.Moore@Sun.COM kmem_cache_t *zio_link_cache; 75789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 763290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 773290Sjohansen 783290Sjohansen #ifdef _KERNEL 793290Sjohansen extern vmem_t *zio_alloc_arena; 803290Sjohansen #endif 81789Sahrens 825329Sgw25295 /* 837754SJeff.Bonwick@Sun.COM * An allocating zio is one that either currently has the DVA allocate 847754SJeff.Bonwick@Sun.COM * stage set or will have it later in its lifetime. 855329Sgw25295 */ 8610922SJeff.Bonwick@Sun.COM #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 8710922SJeff.Bonwick@Sun.COM 8810922SJeff.Bonwick@Sun.COM #ifdef ZFS_DEBUG 8910922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 16384; 9010922SJeff.Bonwick@Sun.COM #else 9110922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 0; 9210922SJeff.Bonwick@Sun.COM #endif 935329Sgw25295 94789Sahrens void 95789Sahrens zio_init(void) 96789Sahrens { 97789Sahrens size_t c; 983290Sjohansen vmem_t *data_alloc_arena = NULL; 993290Sjohansen 1003290Sjohansen #ifdef _KERNEL 1013290Sjohansen data_alloc_arena = zio_alloc_arena; 1023290Sjohansen #endif 1038632SBill.Moore@Sun.COM zio_cache = kmem_cache_create("zio_cache", 1048632SBill.Moore@Sun.COM sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1058632SBill.Moore@Sun.COM zio_link_cache = kmem_cache_create("zio_link_cache", 1068632SBill.Moore@Sun.COM sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1074055Seschrock 108789Sahrens /* 109789Sahrens * For small buffers, we want a cache for each multiple of 110789Sahrens * SPA_MINBLOCKSIZE. 
For medium-size buffers, we want a cache 111789Sahrens * for each quarter-power of 2. For large buffers, we want 112789Sahrens * a cache for each multiple of PAGESIZE. 113789Sahrens */ 114789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 115789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 116789Sahrens size_t p2 = size; 117789Sahrens size_t align = 0; 118789Sahrens 119789Sahrens while (p2 & (p2 - 1)) 120789Sahrens p2 &= p2 - 1; 121789Sahrens 122789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 123789Sahrens align = SPA_MINBLOCKSIZE; 124789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 125789Sahrens align = PAGESIZE; 126789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 127789Sahrens align = p2 >> 2; 128789Sahrens } 129789Sahrens 130789Sahrens if (align != 0) { 1313290Sjohansen char name[36]; 1322856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 133789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 13410922SJeff.Bonwick@Sun.COM align, NULL, NULL, NULL, NULL, NULL, 13510922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 1363290Sjohansen 1373290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1383290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1393290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 14010922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? 
KMC_NODEBUG : 0); 141789Sahrens } 142789Sahrens } 143789Sahrens 144789Sahrens while (--c != 0) { 145789Sahrens ASSERT(zio_buf_cache[c] != NULL); 146789Sahrens if (zio_buf_cache[c - 1] == NULL) 147789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1483290Sjohansen 1493290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1503290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1513290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 152789Sahrens } 1531544Seschrock 1541544Seschrock zio_inject_init(); 155789Sahrens } 156789Sahrens 157789Sahrens void 158789Sahrens zio_fini(void) 159789Sahrens { 160789Sahrens size_t c; 161789Sahrens kmem_cache_t *last_cache = NULL; 1623290Sjohansen kmem_cache_t *last_data_cache = NULL; 163789Sahrens 164789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 165789Sahrens if (zio_buf_cache[c] != last_cache) { 166789Sahrens last_cache = zio_buf_cache[c]; 167789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 168789Sahrens } 169789Sahrens zio_buf_cache[c] = NULL; 1703290Sjohansen 1713290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1723290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1733290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1743290Sjohansen } 1753290Sjohansen zio_data_buf_cache[c] = NULL; 176789Sahrens } 1771544Seschrock 1788632SBill.Moore@Sun.COM kmem_cache_destroy(zio_link_cache); 1794055Seschrock kmem_cache_destroy(zio_cache); 1804055Seschrock 1811544Seschrock zio_inject_fini(); 182789Sahrens } 183789Sahrens 184789Sahrens /* 185789Sahrens * ========================================================================== 186789Sahrens * Allocate and free I/O buffers 187789Sahrens * ========================================================================== 188789Sahrens */ 1893290Sjohansen 1903290Sjohansen /* 1913290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1923290Sjohansen * crashdump if the kernel panics, so use it judiciously. 
Obviously, it's 1933290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1943290Sjohansen * excess / transient data in-core during a crashdump. 1953290Sjohansen */ 196789Sahrens void * 197789Sahrens zio_buf_alloc(size_t size) 198789Sahrens { 199789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 200789Sahrens 201789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 202789Sahrens 2036245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 204789Sahrens } 205789Sahrens 2063290Sjohansen /* 2073290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2083290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2093290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2103290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2113290Sjohansen */ 2123290Sjohansen void * 2133290Sjohansen zio_data_buf_alloc(size_t size) 2143290Sjohansen { 2153290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2163290Sjohansen 2173290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2183290Sjohansen 2196245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2203290Sjohansen } 2213290Sjohansen 222789Sahrens void 223789Sahrens zio_buf_free(void *buf, size_t size) 224789Sahrens { 225789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 226789Sahrens 227789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 228789Sahrens 229789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 230789Sahrens } 231789Sahrens 2323290Sjohansen void 2333290Sjohansen zio_data_buf_free(void *buf, size_t size) 2343290Sjohansen { 2353290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2363290Sjohansen 2373290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2383290Sjohansen 2393290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2403290Sjohansen } 2413463Sahrens 242789Sahrens /* 
243789Sahrens * ========================================================================== 244789Sahrens * Push and pop I/O transform buffers 245789Sahrens * ========================================================================== 246789Sahrens */ 247789Sahrens static void 2487754SJeff.Bonwick@Sun.COM zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 2497754SJeff.Bonwick@Sun.COM zio_transform_func_t *transform) 250789Sahrens { 251789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 252789Sahrens 2537754SJeff.Bonwick@Sun.COM zt->zt_orig_data = zio->io_data; 2547754SJeff.Bonwick@Sun.COM zt->zt_orig_size = zio->io_size; 255789Sahrens zt->zt_bufsize = bufsize; 2567754SJeff.Bonwick@Sun.COM zt->zt_transform = transform; 257789Sahrens 258789Sahrens zt->zt_next = zio->io_transform_stack; 259789Sahrens zio->io_transform_stack = zt; 260789Sahrens 261789Sahrens zio->io_data = data; 262789Sahrens zio->io_size = size; 263789Sahrens } 264789Sahrens 265789Sahrens static void 2667754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio_t *zio) 267789Sahrens { 2687754SJeff.Bonwick@Sun.COM zio_transform_t *zt; 269789Sahrens 2707754SJeff.Bonwick@Sun.COM while ((zt = zio->io_transform_stack) != NULL) { 2717754SJeff.Bonwick@Sun.COM if (zt->zt_transform != NULL) 2727754SJeff.Bonwick@Sun.COM zt->zt_transform(zio, 2737754SJeff.Bonwick@Sun.COM zt->zt_orig_data, zt->zt_orig_size); 274789Sahrens 27510922SJeff.Bonwick@Sun.COM if (zt->zt_bufsize != 0) 27610922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zt->zt_bufsize); 277789Sahrens 2787754SJeff.Bonwick@Sun.COM zio->io_data = zt->zt_orig_data; 2797754SJeff.Bonwick@Sun.COM zio->io_size = zt->zt_orig_size; 2807754SJeff.Bonwick@Sun.COM zio->io_transform_stack = zt->zt_next; 281789Sahrens 2827754SJeff.Bonwick@Sun.COM kmem_free(zt, sizeof (zio_transform_t)); 283789Sahrens } 284789Sahrens } 285789Sahrens 286789Sahrens /* 287789Sahrens * 
========================================================================== 2887754SJeff.Bonwick@Sun.COM * I/O transform callbacks for subblocks and decompression 2897754SJeff.Bonwick@Sun.COM * ========================================================================== 2907754SJeff.Bonwick@Sun.COM */ 2917754SJeff.Bonwick@Sun.COM static void 2927754SJeff.Bonwick@Sun.COM zio_subblock(zio_t *zio, void *data, uint64_t size) 2937754SJeff.Bonwick@Sun.COM { 2947754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size > size); 2957754SJeff.Bonwick@Sun.COM 2967754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ) 2977754SJeff.Bonwick@Sun.COM bcopy(zio->io_data, data, size); 2987754SJeff.Bonwick@Sun.COM } 2997754SJeff.Bonwick@Sun.COM 3007754SJeff.Bonwick@Sun.COM static void 3017754SJeff.Bonwick@Sun.COM zio_decompress(zio_t *zio, void *data, uint64_t size) 3027754SJeff.Bonwick@Sun.COM { 3037754SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && 3047754SJeff.Bonwick@Sun.COM zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 30510922SJeff.Bonwick@Sun.COM zio->io_data, data, zio->io_size, size) != 0) 3067754SJeff.Bonwick@Sun.COM zio->io_error = EIO; 3077754SJeff.Bonwick@Sun.COM } 3087754SJeff.Bonwick@Sun.COM 3097754SJeff.Bonwick@Sun.COM /* 3107754SJeff.Bonwick@Sun.COM * ========================================================================== 3117754SJeff.Bonwick@Sun.COM * I/O parent/child relationships and pipeline interlocks 3127754SJeff.Bonwick@Sun.COM * ========================================================================== 3137754SJeff.Bonwick@Sun.COM */ 3148632SBill.Moore@Sun.COM /* 3158632SBill.Moore@Sun.COM * NOTE - Callers to zio_walk_parents() and zio_walk_children must 3168632SBill.Moore@Sun.COM * continue calling these functions until they return NULL. 3178632SBill.Moore@Sun.COM * Otherwise, the next caller will pick up the list walk in 3188632SBill.Moore@Sun.COM * some indeterminate state. 
(Otherwise every caller would 3198632SBill.Moore@Sun.COM * have to pass in a cookie to keep the state represented by 3208632SBill.Moore@Sun.COM * io_walk_link, which gets annoying.) 3218632SBill.Moore@Sun.COM */ 3228632SBill.Moore@Sun.COM zio_t * 3238632SBill.Moore@Sun.COM zio_walk_parents(zio_t *cio) 3248632SBill.Moore@Sun.COM { 3258632SBill.Moore@Sun.COM zio_link_t *zl = cio->io_walk_link; 3268632SBill.Moore@Sun.COM list_t *pl = &cio->io_parent_list; 3277754SJeff.Bonwick@Sun.COM 3288632SBill.Moore@Sun.COM zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 3298632SBill.Moore@Sun.COM cio->io_walk_link = zl; 3308632SBill.Moore@Sun.COM 3318632SBill.Moore@Sun.COM if (zl == NULL) 3328632SBill.Moore@Sun.COM return (NULL); 3338632SBill.Moore@Sun.COM 3348632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 3358632SBill.Moore@Sun.COM return (zl->zl_parent); 3368632SBill.Moore@Sun.COM } 3378632SBill.Moore@Sun.COM 3388632SBill.Moore@Sun.COM zio_t * 3398632SBill.Moore@Sun.COM zio_walk_children(zio_t *pio) 3407754SJeff.Bonwick@Sun.COM { 3418632SBill.Moore@Sun.COM zio_link_t *zl = pio->io_walk_link; 3428632SBill.Moore@Sun.COM list_t *cl = &pio->io_child_list; 3438632SBill.Moore@Sun.COM 3448632SBill.Moore@Sun.COM zl = (zl == NULL) ? 
list_head(cl) : list_next(cl, zl); 3458632SBill.Moore@Sun.COM pio->io_walk_link = zl; 3468632SBill.Moore@Sun.COM 3478632SBill.Moore@Sun.COM if (zl == NULL) 3488632SBill.Moore@Sun.COM return (NULL); 3498632SBill.Moore@Sun.COM 3508632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 3518632SBill.Moore@Sun.COM return (zl->zl_child); 3528632SBill.Moore@Sun.COM } 3538632SBill.Moore@Sun.COM 3548632SBill.Moore@Sun.COM zio_t * 3558632SBill.Moore@Sun.COM zio_unique_parent(zio_t *cio) 3568632SBill.Moore@Sun.COM { 3578632SBill.Moore@Sun.COM zio_t *pio = zio_walk_parents(cio); 3588632SBill.Moore@Sun.COM 3598632SBill.Moore@Sun.COM VERIFY(zio_walk_parents(cio) == NULL); 3608632SBill.Moore@Sun.COM return (pio); 3618632SBill.Moore@Sun.COM } 3628632SBill.Moore@Sun.COM 3638632SBill.Moore@Sun.COM void 3648632SBill.Moore@Sun.COM zio_add_child(zio_t *pio, zio_t *cio) 3658632SBill.Moore@Sun.COM { 3668632SBill.Moore@Sun.COM zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 3678632SBill.Moore@Sun.COM 3688632SBill.Moore@Sun.COM /* 3698632SBill.Moore@Sun.COM * Logical I/Os can have logical, gang, or vdev children. 3708632SBill.Moore@Sun.COM * Gang I/Os can have gang or vdev children. 3718632SBill.Moore@Sun.COM * Vdev I/Os can only have vdev children. 3728632SBill.Moore@Sun.COM * The following ASSERT captures all of these constraints. 
3738632SBill.Moore@Sun.COM */ 3748632SBill.Moore@Sun.COM ASSERT(cio->io_child_type <= pio->io_child_type); 3758632SBill.Moore@Sun.COM 3768632SBill.Moore@Sun.COM zl->zl_parent = pio; 3778632SBill.Moore@Sun.COM zl->zl_child = cio; 3788632SBill.Moore@Sun.COM 3798632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 3807754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 3818632SBill.Moore@Sun.COM 3828632SBill.Moore@Sun.COM ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 3838632SBill.Moore@Sun.COM 3848632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3858632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 3868632SBill.Moore@Sun.COM 3878632SBill.Moore@Sun.COM list_insert_head(&pio->io_child_list, zl); 3888632SBill.Moore@Sun.COM list_insert_head(&cio->io_parent_list, zl); 3898632SBill.Moore@Sun.COM 39010922SJeff.Bonwick@Sun.COM pio->io_child_count++; 39110922SJeff.Bonwick@Sun.COM cio->io_parent_count++; 39210922SJeff.Bonwick@Sun.COM 3937754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 3948632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 3957754SJeff.Bonwick@Sun.COM } 3967754SJeff.Bonwick@Sun.COM 3977754SJeff.Bonwick@Sun.COM static void 3988632SBill.Moore@Sun.COM zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 3997754SJeff.Bonwick@Sun.COM { 4008632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 4018632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 4027754SJeff.Bonwick@Sun.COM 4038632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 4047754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4058632SBill.Moore@Sun.COM 4068632SBill.Moore@Sun.COM list_remove(&pio->io_child_list, zl); 4078632SBill.Moore@Sun.COM list_remove(&cio->io_parent_list, zl); 4088632SBill.Moore@Sun.COM 40910922SJeff.Bonwick@Sun.COM pio->io_child_count--; 41010922SJeff.Bonwick@Sun.COM cio->io_parent_count--; 41110922SJeff.Bonwick@Sun.COM 4127754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4138632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 
4148632SBill.Moore@Sun.COM 4158632SBill.Moore@Sun.COM kmem_cache_free(zio_link_cache, zl); 4167754SJeff.Bonwick@Sun.COM } 4177754SJeff.Bonwick@Sun.COM 4187754SJeff.Bonwick@Sun.COM static boolean_t 4197754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 4207754SJeff.Bonwick@Sun.COM { 4217754SJeff.Bonwick@Sun.COM uint64_t *countp = &zio->io_children[child][wait]; 4227754SJeff.Bonwick@Sun.COM boolean_t waiting = B_FALSE; 4237754SJeff.Bonwick@Sun.COM 4247754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 4257754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 4267754SJeff.Bonwick@Sun.COM if (*countp != 0) { 42710922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 4287754SJeff.Bonwick@Sun.COM zio->io_stall = countp; 4297754SJeff.Bonwick@Sun.COM waiting = B_TRUE; 4307754SJeff.Bonwick@Sun.COM } 4317754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 4327754SJeff.Bonwick@Sun.COM 4337754SJeff.Bonwick@Sun.COM return (waiting); 4347754SJeff.Bonwick@Sun.COM } 4357754SJeff.Bonwick@Sun.COM 4367754SJeff.Bonwick@Sun.COM static void 4377754SJeff.Bonwick@Sun.COM zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 4387754SJeff.Bonwick@Sun.COM { 4397754SJeff.Bonwick@Sun.COM uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 4407754SJeff.Bonwick@Sun.COM int *errorp = &pio->io_child_error[zio->io_child_type]; 4417754SJeff.Bonwick@Sun.COM 4427754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4437754SJeff.Bonwick@Sun.COM if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 4447754SJeff.Bonwick@Sun.COM *errorp = zio_worst_error(*errorp, zio->io_error); 4457754SJeff.Bonwick@Sun.COM pio->io_reexecute |= zio->io_reexecute; 4467754SJeff.Bonwick@Sun.COM ASSERT3U(*countp, >, 0); 4477754SJeff.Bonwick@Sun.COM if (--*countp == 0 && pio->io_stall == countp) { 4487754SJeff.Bonwick@Sun.COM pio->io_stall = NULL; 4497754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4507754SJeff.Bonwick@Sun.COM 
zio_execute(pio); 4517754SJeff.Bonwick@Sun.COM } else { 4527754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4537754SJeff.Bonwick@Sun.COM } 4547754SJeff.Bonwick@Sun.COM } 4557754SJeff.Bonwick@Sun.COM 4567754SJeff.Bonwick@Sun.COM static void 4577754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio_t *zio, enum zio_child c) 4587754SJeff.Bonwick@Sun.COM { 4597754SJeff.Bonwick@Sun.COM if (zio->io_child_error[c] != 0 && zio->io_error == 0) 4607754SJeff.Bonwick@Sun.COM zio->io_error = zio->io_child_error[c]; 4617754SJeff.Bonwick@Sun.COM } 4627754SJeff.Bonwick@Sun.COM 4637754SJeff.Bonwick@Sun.COM /* 4647754SJeff.Bonwick@Sun.COM * ========================================================================== 4657754SJeff.Bonwick@Sun.COM * Create the various types of I/O (read, write, free, etc) 466789Sahrens * ========================================================================== 467789Sahrens */ 468789Sahrens static zio_t * 46910922SJeff.Bonwick@Sun.COM zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 470789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 47110922SJeff.Bonwick@Sun.COM zio_type_t type, int priority, enum zio_flag flags, 47210922SJeff.Bonwick@Sun.COM vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 47310922SJeff.Bonwick@Sun.COM enum zio_stage stage, enum zio_stage pipeline) 474789Sahrens { 475789Sahrens zio_t *zio; 476789Sahrens 477789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 478789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 4797754SJeff.Bonwick@Sun.COM ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 480789Sahrens 4817754SJeff.Bonwick@Sun.COM ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 4827754SJeff.Bonwick@Sun.COM ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 4837754SJeff.Bonwick@Sun.COM ASSERT(vd || stage == ZIO_STAGE_OPEN); 4847046Sahrens 4854055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 4864055Seschrock bzero(zio, sizeof (zio_t)); 
4877754SJeff.Bonwick@Sun.COM 4887754SJeff.Bonwick@Sun.COM mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 4897754SJeff.Bonwick@Sun.COM cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 4907754SJeff.Bonwick@Sun.COM 4918632SBill.Moore@Sun.COM list_create(&zio->io_parent_list, sizeof (zio_link_t), 4928632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_parent_node)); 4938632SBill.Moore@Sun.COM list_create(&zio->io_child_list, sizeof (zio_link_t), 4948632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_child_node)); 4958632SBill.Moore@Sun.COM 4967754SJeff.Bonwick@Sun.COM if (vd != NULL) 4977754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_VDEV; 4987754SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_GANG_CHILD) 4997754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_GANG; 50010922SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_DDT_CHILD) 50110922SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_DDT; 5027754SJeff.Bonwick@Sun.COM else 5037754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_LOGICAL; 5047754SJeff.Bonwick@Sun.COM 505789Sahrens if (bp != NULL) { 50610922SJeff.Bonwick@Sun.COM zio->io_bp = (blkptr_t *)bp; 507789Sahrens zio->io_bp_copy = *bp; 508789Sahrens zio->io_bp_orig = *bp; 50910922SJeff.Bonwick@Sun.COM if (type != ZIO_TYPE_WRITE || 51010922SJeff.Bonwick@Sun.COM zio->io_child_type == ZIO_CHILD_DDT) 5117754SJeff.Bonwick@Sun.COM zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 5129443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) 5137754SJeff.Bonwick@Sun.COM zio->io_logical = zio; 5149443SBill.Moore@Sun.COM if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 5159443SBill.Moore@Sun.COM pipeline |= ZIO_GANG_STAGES; 516789Sahrens } 5177754SJeff.Bonwick@Sun.COM 5187754SJeff.Bonwick@Sun.COM zio->io_spa = spa; 5197754SJeff.Bonwick@Sun.COM zio->io_txg = txg; 520789Sahrens zio->io_done = done; 521789Sahrens zio->io_private = private; 522789Sahrens zio->io_type = type; 523789Sahrens zio->io_priority = priority; 
5247754SJeff.Bonwick@Sun.COM zio->io_vd = vd; 5257754SJeff.Bonwick@Sun.COM zio->io_offset = offset; 52610922SJeff.Bonwick@Sun.COM zio->io_orig_data = zio->io_data = data; 52710922SJeff.Bonwick@Sun.COM zio->io_orig_size = zio->io_size = size; 5287754SJeff.Bonwick@Sun.COM zio->io_orig_flags = zio->io_flags = flags; 5297754SJeff.Bonwick@Sun.COM zio->io_orig_stage = zio->io_stage = stage; 5307754SJeff.Bonwick@Sun.COM zio->io_orig_pipeline = zio->io_pipeline = pipeline; 5317754SJeff.Bonwick@Sun.COM 5328632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 5338632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 5348632SBill.Moore@Sun.COM 5357754SJeff.Bonwick@Sun.COM if (zb != NULL) 5367754SJeff.Bonwick@Sun.COM zio->io_bookmark = *zb; 537789Sahrens 5387754SJeff.Bonwick@Sun.COM if (pio != NULL) { 5397754SJeff.Bonwick@Sun.COM if (zio->io_logical == NULL) 5401544Seschrock zio->io_logical = pio->io_logical; 5419443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_GANG) 5429443SBill.Moore@Sun.COM zio->io_gang_leader = pio->io_gang_leader; 5437754SJeff.Bonwick@Sun.COM zio_add_child(pio, zio); 544789Sahrens } 545789Sahrens 546789Sahrens return (zio); 547789Sahrens } 548789Sahrens 5495329Sgw25295 static void 5507754SJeff.Bonwick@Sun.COM zio_destroy(zio_t *zio) 5515329Sgw25295 { 5528632SBill.Moore@Sun.COM list_destroy(&zio->io_parent_list); 5538632SBill.Moore@Sun.COM list_destroy(&zio->io_child_list); 5547754SJeff.Bonwick@Sun.COM mutex_destroy(&zio->io_lock); 5557754SJeff.Bonwick@Sun.COM cv_destroy(&zio->io_cv); 5567754SJeff.Bonwick@Sun.COM kmem_cache_free(zio_cache, zio); 5575329Sgw25295 } 5585329Sgw25295 559789Sahrens zio_t * 5608632SBill.Moore@Sun.COM zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 56110922SJeff.Bonwick@Sun.COM void *private, enum zio_flag flags) 562789Sahrens { 563789Sahrens zio_t *zio; 564789Sahrens 565789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 
5668632SBill.Moore@Sun.COM ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 5677754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 568789Sahrens 569789Sahrens return (zio); 570789Sahrens } 571789Sahrens 572789Sahrens zio_t * 57310922SJeff.Bonwick@Sun.COM zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 574789Sahrens { 5758632SBill.Moore@Sun.COM return (zio_null(NULL, spa, NULL, done, private, flags)); 576789Sahrens } 577789Sahrens 578789Sahrens zio_t * 5797754SJeff.Bonwick@Sun.COM zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 5807754SJeff.Bonwick@Sun.COM void *data, uint64_t size, zio_done_func_t *done, void *private, 58110922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 582789Sahrens { 583789Sahrens zio_t *zio; 584789Sahrens 58510922SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 5867046Sahrens data, size, done, private, 5877754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 58810922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
58910922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 590789Sahrens 591789Sahrens return (zio); 592789Sahrens } 593789Sahrens 594789Sahrens zio_t * 5957754SJeff.Bonwick@Sun.COM zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 59610922SJeff.Bonwick@Sun.COM void *data, uint64_t size, const zio_prop_t *zp, 5977754SJeff.Bonwick@Sun.COM zio_done_func_t *ready, zio_done_func_t *done, void *private, 59810922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 599789Sahrens { 600789Sahrens zio_t *zio; 601789Sahrens 6027754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 6037754SJeff.Bonwick@Sun.COM zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 6047754SJeff.Bonwick@Sun.COM zp->zp_compress >= ZIO_COMPRESS_OFF && 6057754SJeff.Bonwick@Sun.COM zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 6067754SJeff.Bonwick@Sun.COM zp->zp_type < DMU_OT_NUMTYPES && 6077754SJeff.Bonwick@Sun.COM zp->zp_level < 32 && 60810922SJeff.Bonwick@Sun.COM zp->zp_copies > 0 && 60910922SJeff.Bonwick@Sun.COM zp->zp_copies <= spa_max_replication(spa) && 61010922SJeff.Bonwick@Sun.COM zp->zp_dedup <= 1 && 61110922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify <= 1); 6125329Sgw25295 613789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6147754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 61510922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
61610922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 617789Sahrens 6183547Smaybee zio->io_ready = ready; 6197754SJeff.Bonwick@Sun.COM zio->io_prop = *zp; 620789Sahrens 621789Sahrens return (zio); 622789Sahrens } 623789Sahrens 624789Sahrens zio_t * 6257754SJeff.Bonwick@Sun.COM zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 6267754SJeff.Bonwick@Sun.COM uint64_t size, zio_done_func_t *done, void *private, int priority, 62710922SJeff.Bonwick@Sun.COM enum zio_flag flags, zbookmark_t *zb) 628789Sahrens { 629789Sahrens zio_t *zio; 630789Sahrens 6317181Sperrin zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6327754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 6337754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 634789Sahrens 635789Sahrens return (zio); 636789Sahrens } 637789Sahrens 63810922SJeff.Bonwick@Sun.COM void 63910922SJeff.Bonwick@Sun.COM zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 64010922SJeff.Bonwick@Sun.COM { 64110922SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 64210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 64310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 64410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 64510922SJeff.Bonwick@Sun.COM 64610922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies = copies; 64710922SJeff.Bonwick@Sun.COM zio->io_bp_override = bp; 64810922SJeff.Bonwick@Sun.COM } 64910922SJeff.Bonwick@Sun.COM 65010922SJeff.Bonwick@Sun.COM void 65110922SJeff.Bonwick@Sun.COM zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 65210922SJeff.Bonwick@Sun.COM { 65310922SJeff.Bonwick@Sun.COM bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); 65410922SJeff.Bonwick@Sun.COM } 65510922SJeff.Bonwick@Sun.COM 656789Sahrens zio_t * 65710922SJeff.Bonwick@Sun.COM zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 
65810922SJeff.Bonwick@Sun.COM enum zio_flag flags) 659789Sahrens { 660789Sahrens zio_t *zio; 661789Sahrens 662789Sahrens ASSERT(!BP_IS_HOLE(bp)); 66310922SJeff.Bonwick@Sun.COM ASSERT(spa_syncing_txg(spa) == txg); 66410922SJeff.Bonwick@Sun.COM ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 665789Sahrens 6667754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 66710922SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 6687754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 669789Sahrens 670789Sahrens return (zio); 671789Sahrens } 672789Sahrens 673789Sahrens zio_t * 67410922SJeff.Bonwick@Sun.COM zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 67510922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, enum zio_flag flags) 676789Sahrens { 677789Sahrens zio_t *zio; 678789Sahrens 679789Sahrens /* 680789Sahrens * A claim is an allocation of a specific block. Claims are needed 681789Sahrens * to support immediate writes in the intent log. The issue is that 682789Sahrens * immediate writes contain committed data, but in a txg that was 683789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 684789Sahrens * the intent log claims all blocks that contain immediate write data 685789Sahrens * so that the SPA knows they're in use. 686789Sahrens * 687789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 688789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 68910922SJeff.Bonwick@Sun.COM * If txg == 0 we just verify that the block is claimable. 
690789Sahrens */ 691789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 69210922SJeff.Bonwick@Sun.COM ASSERT(txg == spa_first_txg(spa) || txg == 0); 69310922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 694789Sahrens 6957754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 6967754SJeff.Bonwick@Sun.COM done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 6977754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 698789Sahrens 699789Sahrens return (zio); 700789Sahrens } 701789Sahrens 702789Sahrens zio_t * 703789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 70410922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 705789Sahrens { 706789Sahrens zio_t *zio; 707789Sahrens int c; 708789Sahrens 709789Sahrens if (vd->vdev_children == 0) { 710789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 7117754SJeff.Bonwick@Sun.COM ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 712789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 713789Sahrens 714789Sahrens zio->io_cmd = cmd; 715789Sahrens } else { 7168632SBill.Moore@Sun.COM zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 717789Sahrens 718789Sahrens for (c = 0; c < vd->vdev_children; c++) 719789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 720789Sahrens done, private, priority, flags)); 721789Sahrens } 722789Sahrens 723789Sahrens return (zio); 724789Sahrens } 725789Sahrens 726789Sahrens zio_t * 727789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 728789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 72910922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 730789Sahrens { 731789Sahrens zio_t *zio; 7325329Sgw25295 7337754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7347754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size 
<= VDEV_LABEL_START_SIZE || 7357754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7367754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 737789Sahrens 7387754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7397754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 740789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 741789Sahrens 7427754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 743789Sahrens 744789Sahrens return (zio); 745789Sahrens } 746789Sahrens 747789Sahrens zio_t * 748789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 749789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 75010922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 751789Sahrens { 752789Sahrens zio_t *zio; 753789Sahrens 7547754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7557754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 7567754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7577754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 7585329Sgw25295 7597754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7607754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 761789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 762789Sahrens 7637754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 764789Sahrens 765789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 766789Sahrens /* 767789Sahrens * zbt checksums are necessarily destructive -- they modify 7687754SJeff.Bonwick@Sun.COM * the end of the write buffer to hold the verifier/checksum. 769789Sahrens * Therefore, we must make a local copy in case the data is 7707754SJeff.Bonwick@Sun.COM * being written to multiple places in parallel. 
771789Sahrens */ 7727754SJeff.Bonwick@Sun.COM void *wbuf = zio_buf_alloc(size); 773789Sahrens bcopy(data, wbuf, size); 7747754SJeff.Bonwick@Sun.COM zio_push_transform(zio, wbuf, size, size, NULL); 775789Sahrens } 776789Sahrens 777789Sahrens return (zio); 778789Sahrens } 779789Sahrens 780789Sahrens /* 7817754SJeff.Bonwick@Sun.COM * Create a child I/O to do some work for us. 782789Sahrens */ 783789Sahrens zio_t * 7847754SJeff.Bonwick@Sun.COM zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 78510922SJeff.Bonwick@Sun.COM void *data, uint64_t size, int type, int priority, enum zio_flag flags, 786789Sahrens zio_done_func_t *done, void *private) 787789Sahrens { 78810922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 7897754SJeff.Bonwick@Sun.COM zio_t *zio; 7907754SJeff.Bonwick@Sun.COM 7917754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_parent == 7927754SJeff.Bonwick@Sun.COM (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 793789Sahrens 794789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 795789Sahrens /* 796789Sahrens * If we have the bp, then the child should perform the 797789Sahrens * checksum and the parent need not. This pushes error 798789Sahrens * detection as close to the leaves as possible and 799789Sahrens * eliminates redundant checksums in the interior nodes. 
800789Sahrens */ 80110922SJeff.Bonwick@Sun.COM pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 80210922SJeff.Bonwick@Sun.COM pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 8037754SJeff.Bonwick@Sun.COM } 8047754SJeff.Bonwick@Sun.COM 8057754SJeff.Bonwick@Sun.COM if (vd->vdev_children == 0) 8067754SJeff.Bonwick@Sun.COM offset += VDEV_LABEL_START_SIZE; 8077754SJeff.Bonwick@Sun.COM 80810922SJeff.Bonwick@Sun.COM flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 80910922SJeff.Bonwick@Sun.COM 81010922SJeff.Bonwick@Sun.COM /* 81110922SJeff.Bonwick@Sun.COM * If we've decided to do a repair, the write is not speculative -- 81210922SJeff.Bonwick@Sun.COM * even if the original read was. 81310922SJeff.Bonwick@Sun.COM */ 81410922SJeff.Bonwick@Sun.COM if (flags & ZIO_FLAG_IO_REPAIR) 81510922SJeff.Bonwick@Sun.COM flags &= ~ZIO_FLAG_SPECULATIVE; 81610922SJeff.Bonwick@Sun.COM 8177754SJeff.Bonwick@Sun.COM zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 81810922SJeff.Bonwick@Sun.COM done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 81910922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 8207754SJeff.Bonwick@Sun.COM 8217754SJeff.Bonwick@Sun.COM return (zio); 8227754SJeff.Bonwick@Sun.COM } 8237754SJeff.Bonwick@Sun.COM 8247754SJeff.Bonwick@Sun.COM zio_t * 8257754SJeff.Bonwick@Sun.COM zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 82610922SJeff.Bonwick@Sun.COM int type, int priority, enum zio_flag flags, 82710922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private) 8287754SJeff.Bonwick@Sun.COM { 8297754SJeff.Bonwick@Sun.COM zio_t *zio; 8307754SJeff.Bonwick@Sun.COM 8317754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_ops->vdev_op_leaf); 8327754SJeff.Bonwick@Sun.COM 8337754SJeff.Bonwick@Sun.COM zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 8347754SJeff.Bonwick@Sun.COM data, size, done, private, type, priority, 8357754SJeff.Bonwick@Sun.COM flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 
8367754SJeff.Bonwick@Sun.COM vd, offset, NULL, 83710922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 8387754SJeff.Bonwick@Sun.COM 8397754SJeff.Bonwick@Sun.COM return (zio); 8407754SJeff.Bonwick@Sun.COM } 8417754SJeff.Bonwick@Sun.COM 8427754SJeff.Bonwick@Sun.COM void 8437754SJeff.Bonwick@Sun.COM zio_flush(zio_t *zio, vdev_t *vd) 8447754SJeff.Bonwick@Sun.COM { 8457754SJeff.Bonwick@Sun.COM zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 8467754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_PRIORITY_NOW, 8477754SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 8487754SJeff.Bonwick@Sun.COM } 8497754SJeff.Bonwick@Sun.COM 8507754SJeff.Bonwick@Sun.COM /* 8517754SJeff.Bonwick@Sun.COM * ========================================================================== 8527754SJeff.Bonwick@Sun.COM * Prepare to read and write logical blocks 8537754SJeff.Bonwick@Sun.COM * ========================================================================== 8547754SJeff.Bonwick@Sun.COM */ 8557754SJeff.Bonwick@Sun.COM 8567754SJeff.Bonwick@Sun.COM static int 8577754SJeff.Bonwick@Sun.COM zio_read_bp_init(zio_t *zio) 8587754SJeff.Bonwick@Sun.COM { 8597754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 8607754SJeff.Bonwick@Sun.COM 8618274SJeff.Bonwick@Sun.COM if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 8629443SBill.Moore@Sun.COM zio->io_child_type == ZIO_CHILD_LOGICAL && 8639443SBill.Moore@Sun.COM !(zio->io_flags & ZIO_FLAG_RAW)) { 86410922SJeff.Bonwick@Sun.COM uint64_t psize = BP_GET_PSIZE(bp); 86510922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(psize); 86610922SJeff.Bonwick@Sun.COM 86710922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 8687754SJeff.Bonwick@Sun.COM } 8697754SJeff.Bonwick@Sun.COM 8707754SJeff.Bonwick@Sun.COM if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 8717754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 
8727754SJeff.Bonwick@Sun.COM 87311125SJeff.Bonwick@Sun.COM if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 87411125SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 87511125SJeff.Bonwick@Sun.COM 87610922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 87710922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 87810922SJeff.Bonwick@Sun.COM 8797754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 8807754SJeff.Bonwick@Sun.COM } 8817754SJeff.Bonwick@Sun.COM 8827754SJeff.Bonwick@Sun.COM static int 8837754SJeff.Bonwick@Sun.COM zio_write_bp_init(zio_t *zio) 8847754SJeff.Bonwick@Sun.COM { 88510922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 8867754SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 88710922SJeff.Bonwick@Sun.COM enum zio_compress compress = zp->zp_compress; 8887754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 8897754SJeff.Bonwick@Sun.COM uint64_t lsize = zio->io_size; 89010922SJeff.Bonwick@Sun.COM uint64_t psize = lsize; 8917754SJeff.Bonwick@Sun.COM int pass = 1; 8927754SJeff.Bonwick@Sun.COM 8937754SJeff.Bonwick@Sun.COM /* 8947754SJeff.Bonwick@Sun.COM * If our children haven't all reached the ready stage, 8957754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 
8967754SJeff.Bonwick@Sun.COM */ 8977754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 8987754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 8997754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 9007754SJeff.Bonwick@Sun.COM 9017754SJeff.Bonwick@Sun.COM if (!IO_IS_ALLOCATING(zio)) 9027754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 9037754SJeff.Bonwick@Sun.COM 90410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 90510922SJeff.Bonwick@Sun.COM 90610922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 90710922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth != zio->io_txg); 90810922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 90910922SJeff.Bonwick@Sun.COM 91010922SJeff.Bonwick@Sun.COM *bp = *zio->io_bp_override; 91110922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 91210922SJeff.Bonwick@Sun.COM 91310922SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(bp) || !zp->zp_dedup) 91410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 91510922SJeff.Bonwick@Sun.COM 91610922SJeff.Bonwick@Sun.COM ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 91710922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify); 91810922SJeff.Bonwick@Sun.COM 91910922SJeff.Bonwick@Sun.COM if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 92010922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, 1); 92110922SJeff.Bonwick@Sun.COM zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 92210922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 92310922SJeff.Bonwick@Sun.COM } 92410922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 92510922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 92610922SJeff.Bonwick@Sun.COM } 9277754SJeff.Bonwick@Sun.COM 9287754SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg) { 9297754SJeff.Bonwick@Sun.COM /* 9307754SJeff.Bonwick@Sun.COM * We're rewriting an existing block, which means we're 9317754SJeff.Bonwick@Sun.COM * working on behalf of spa_sync(). 
For spa_sync() to 9327754SJeff.Bonwick@Sun.COM * converge, it must eventually be the case that we don't 9337754SJeff.Bonwick@Sun.COM * have to allocate new blocks. But compression changes 9347754SJeff.Bonwick@Sun.COM * the blocksize, which forces a reallocate, and makes 9357754SJeff.Bonwick@Sun.COM * convergence take longer. Therefore, after the first 9367754SJeff.Bonwick@Sun.COM * few passes, stop compressing to ensure convergence. 9377754SJeff.Bonwick@Sun.COM */ 93810922SJeff.Bonwick@Sun.COM pass = spa_sync_pass(spa); 93910922SJeff.Bonwick@Sun.COM 94010922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(spa)); 94110922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 94210922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp)); 9437754SJeff.Bonwick@Sun.COM 9447754SJeff.Bonwick@Sun.COM if (pass > SYNC_PASS_DONT_COMPRESS) 9457754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 9467754SJeff.Bonwick@Sun.COM 9477754SJeff.Bonwick@Sun.COM /* Make sure someone doesn't change their mind on overwrites */ 94810922SJeff.Bonwick@Sun.COM ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 94910922SJeff.Bonwick@Sun.COM spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 9507754SJeff.Bonwick@Sun.COM } 9517754SJeff.Bonwick@Sun.COM 9527754SJeff.Bonwick@Sun.COM if (compress != ZIO_COMPRESS_OFF) { 95310922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(lsize); 95410922SJeff.Bonwick@Sun.COM psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 95510922SJeff.Bonwick@Sun.COM if (psize == 0 || psize == lsize) { 9567754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 95710922SJeff.Bonwick@Sun.COM zio_buf_free(cbuf, lsize); 95810922SJeff.Bonwick@Sun.COM } else { 95910922SJeff.Bonwick@Sun.COM ASSERT(psize < lsize); 96010922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, lsize, NULL); 9617754SJeff.Bonwick@Sun.COM } 962789Sahrens } 963789Sahrens 9647754SJeff.Bonwick@Sun.COM /* 9657754SJeff.Bonwick@Sun.COM * The final pass of spa_sync() must be all 
rewrites, but the first 9667754SJeff.Bonwick@Sun.COM * few passes offer a trade-off: allocating blocks defers convergence, 9677754SJeff.Bonwick@Sun.COM * but newly allocated blocks are sequential, so they can be written 9687754SJeff.Bonwick@Sun.COM * to disk faster. Therefore, we allow the first few passes of 9697754SJeff.Bonwick@Sun.COM * spa_sync() to allocate new blocks, but force rewrites after that. 9707754SJeff.Bonwick@Sun.COM * There should only be a handful of blocks after pass 1 in any case. 9717754SJeff.Bonwick@Sun.COM */ 97210922SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 9737754SJeff.Bonwick@Sun.COM pass > SYNC_PASS_REWRITE) { 97410922SJeff.Bonwick@Sun.COM ASSERT(psize != 0); 97510922SJeff.Bonwick@Sun.COM enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 9767754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 9777754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_REWRITE; 9787754SJeff.Bonwick@Sun.COM } else { 9797754SJeff.Bonwick@Sun.COM BP_ZERO(bp); 9807754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 9817754SJeff.Bonwick@Sun.COM } 9827754SJeff.Bonwick@Sun.COM 98310922SJeff.Bonwick@Sun.COM if (psize == 0) { 9847754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 9857754SJeff.Bonwick@Sun.COM } else { 9867754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 9877754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(bp, lsize); 98810922SJeff.Bonwick@Sun.COM BP_SET_PSIZE(bp, psize); 9897754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(bp, compress); 9907754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(bp, zp->zp_checksum); 9917754SJeff.Bonwick@Sun.COM BP_SET_TYPE(bp, zp->zp_type); 9927754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(bp, zp->zp_level); 99310922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, zp->zp_dedup); 9947754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 99510922SJeff.Bonwick@Sun.COM if (zp->zp_dedup) { 
99610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 99710922SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 99810922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 99910922SJeff.Bonwick@Sun.COM } 100010922SJeff.Bonwick@Sun.COM } 100110922SJeff.Bonwick@Sun.COM 100210922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 100310922SJeff.Bonwick@Sun.COM } 100410922SJeff.Bonwick@Sun.COM 100510922SJeff.Bonwick@Sun.COM static int 100610922SJeff.Bonwick@Sun.COM zio_free_bp_init(zio_t *zio) 100710922SJeff.Bonwick@Sun.COM { 100810922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 100910922SJeff.Bonwick@Sun.COM 101010922SJeff.Bonwick@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 101110922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp)) 101210922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 101310922SJeff.Bonwick@Sun.COM else 101410922SJeff.Bonwick@Sun.COM arc_free(zio->io_spa, bp); 10157754SJeff.Bonwick@Sun.COM } 10167754SJeff.Bonwick@Sun.COM 10177754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 10187754SJeff.Bonwick@Sun.COM } 10197754SJeff.Bonwick@Sun.COM 10207754SJeff.Bonwick@Sun.COM /* 10217754SJeff.Bonwick@Sun.COM * ========================================================================== 10227754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline 10237754SJeff.Bonwick@Sun.COM * ========================================================================== 10247754SJeff.Bonwick@Sun.COM */ 10257754SJeff.Bonwick@Sun.COM 10267754SJeff.Bonwick@Sun.COM static void 10277754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) 10287754SJeff.Bonwick@Sun.COM { 1029*11146SGeorge.Wilson@Sun.COM spa_t *spa = zio->io_spa; 10307754SJeff.Bonwick@Sun.COM zio_type_t t = zio->io_type; 10317754SJeff.Bonwick@Sun.COM 10327754SJeff.Bonwick@Sun.COM /* 10339722SGeorge.Wilson@Sun.COM * If we're a config writer or a probe, the normal issue and 10349722SGeorge.Wilson@Sun.COM * 
interrupt threads may all be blocked waiting for the config lock. 10359722SGeorge.Wilson@Sun.COM * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 10367754SJeff.Bonwick@Sun.COM */ 10379722SGeorge.Wilson@Sun.COM if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 10387754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10397754SJeff.Bonwick@Sun.COM 10407754SJeff.Bonwick@Sun.COM /* 10417754SJeff.Bonwick@Sun.COM * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 10427754SJeff.Bonwick@Sun.COM */ 10437754SJeff.Bonwick@Sun.COM if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 10447754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10457754SJeff.Bonwick@Sun.COM 1046*11146SGeorge.Wilson@Sun.COM /* 1047*11146SGeorge.Wilson@Sun.COM * If this is a high priority I/O, then use the high priority taskq. 1048*11146SGeorge.Wilson@Sun.COM */ 1049*11146SGeorge.Wilson@Sun.COM if (zio->io_priority == ZIO_PRIORITY_NOW && 1050*11146SGeorge.Wilson@Sun.COM spa->spa_zio_taskq[t][q + 1] != NULL) 1051*11146SGeorge.Wilson@Sun.COM q++; 1052*11146SGeorge.Wilson@Sun.COM 1053*11146SGeorge.Wilson@Sun.COM ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1054*11146SGeorge.Wilson@Sun.COM (void) taskq_dispatch(spa->spa_zio_taskq[t][q], 10557754SJeff.Bonwick@Sun.COM (task_func_t *)zio_execute, zio, TQ_SLEEP); 10567754SJeff.Bonwick@Sun.COM } 10577754SJeff.Bonwick@Sun.COM 10587754SJeff.Bonwick@Sun.COM static boolean_t 10597754SJeff.Bonwick@Sun.COM zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 10607754SJeff.Bonwick@Sun.COM { 10617754SJeff.Bonwick@Sun.COM kthread_t *executor = zio->io_executor; 10627754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 1063789Sahrens 10647754SJeff.Bonwick@Sun.COM for (zio_type_t t = 0; t < ZIO_TYPES; t++) 10657754SJeff.Bonwick@Sun.COM if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 10667754SJeff.Bonwick@Sun.COM return (B_TRUE); 10677754SJeff.Bonwick@Sun.COM 10687754SJeff.Bonwick@Sun.COM return (B_FALSE); 10697754SJeff.Bonwick@Sun.COM } 
10707754SJeff.Bonwick@Sun.COM 10717754SJeff.Bonwick@Sun.COM static int 10727754SJeff.Bonwick@Sun.COM zio_issue_async(zio_t *zio) 10737754SJeff.Bonwick@Sun.COM { 10747754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 10757754SJeff.Bonwick@Sun.COM 10767754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 10777754SJeff.Bonwick@Sun.COM } 10787754SJeff.Bonwick@Sun.COM 10797754SJeff.Bonwick@Sun.COM void 10807754SJeff.Bonwick@Sun.COM zio_interrupt(zio_t *zio) 10817754SJeff.Bonwick@Sun.COM { 10827754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); 10837754SJeff.Bonwick@Sun.COM } 10847754SJeff.Bonwick@Sun.COM 10857754SJeff.Bonwick@Sun.COM /* 10867754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline until one of the following occurs: 10877754SJeff.Bonwick@Sun.COM * (1) the I/O completes; (2) the pipeline stalls waiting for 10887754SJeff.Bonwick@Sun.COM * dependent child I/Os; (3) the I/O issues, so we're waiting 10897754SJeff.Bonwick@Sun.COM * for an I/O completion interrupt; (4) the I/O is delegated by 10907754SJeff.Bonwick@Sun.COM * vdev-level caching or aggregation; (5) the I/O is deferred 10917754SJeff.Bonwick@Sun.COM * due to vdev-level queueing; (6) the I/O is handed off to 10927754SJeff.Bonwick@Sun.COM * another thread. In all cases, the pipeline stops whenever 10937754SJeff.Bonwick@Sun.COM * there's no CPU work; it never burns a thread in cv_wait(). 10947754SJeff.Bonwick@Sun.COM * 10957754SJeff.Bonwick@Sun.COM * There's no locking on io_stage because there's no legitimate way 10967754SJeff.Bonwick@Sun.COM * for multiple threads to be attempting to process the same I/O. 
10977754SJeff.Bonwick@Sun.COM */ 109810922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[]; 1099789Sahrens 11007754SJeff.Bonwick@Sun.COM void 11017754SJeff.Bonwick@Sun.COM zio_execute(zio_t *zio) 11027754SJeff.Bonwick@Sun.COM { 11037754SJeff.Bonwick@Sun.COM zio->io_executor = curthread; 11047754SJeff.Bonwick@Sun.COM 11057754SJeff.Bonwick@Sun.COM while (zio->io_stage < ZIO_STAGE_DONE) { 110610922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = zio->io_pipeline; 110710922SJeff.Bonwick@Sun.COM enum zio_stage stage = zio->io_stage; 11087754SJeff.Bonwick@Sun.COM int rv; 11097754SJeff.Bonwick@Sun.COM 11107754SJeff.Bonwick@Sun.COM ASSERT(!MUTEX_HELD(&zio->io_lock)); 111110922SJeff.Bonwick@Sun.COM ASSERT(ISP2(stage)); 111210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 111310922SJeff.Bonwick@Sun.COM 111410922SJeff.Bonwick@Sun.COM do { 111510922SJeff.Bonwick@Sun.COM stage <<= 1; 111610922SJeff.Bonwick@Sun.COM } while ((stage & pipeline) == 0); 11177754SJeff.Bonwick@Sun.COM 11187754SJeff.Bonwick@Sun.COM ASSERT(stage <= ZIO_STAGE_DONE); 11197754SJeff.Bonwick@Sun.COM 11207754SJeff.Bonwick@Sun.COM /* 11217754SJeff.Bonwick@Sun.COM * If we are in interrupt context and this pipeline stage 11227754SJeff.Bonwick@Sun.COM * will grab a config lock that is held across I/O, 112310922SJeff.Bonwick@Sun.COM * or may wait for an I/O that needs an interrupt thread 112410922SJeff.Bonwick@Sun.COM * to complete, issue async to avoid deadlock. 
11257754SJeff.Bonwick@Sun.COM */ 112610922SJeff.Bonwick@Sun.COM if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 11277754SJeff.Bonwick@Sun.COM zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 11287754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 11297754SJeff.Bonwick@Sun.COM return; 11307754SJeff.Bonwick@Sun.COM } 11317754SJeff.Bonwick@Sun.COM 11327754SJeff.Bonwick@Sun.COM zio->io_stage = stage; 113310922SJeff.Bonwick@Sun.COM rv = zio_pipeline[highbit(stage) - 1](zio); 11347754SJeff.Bonwick@Sun.COM 11357754SJeff.Bonwick@Sun.COM if (rv == ZIO_PIPELINE_STOP) 11367754SJeff.Bonwick@Sun.COM return; 11377754SJeff.Bonwick@Sun.COM 11387754SJeff.Bonwick@Sun.COM ASSERT(rv == ZIO_PIPELINE_CONTINUE); 11397754SJeff.Bonwick@Sun.COM } 1140789Sahrens } 1141789Sahrens 1142789Sahrens /* 1143789Sahrens * ========================================================================== 1144789Sahrens * Initiate I/O, either sync or async 1145789Sahrens * ========================================================================== 1146789Sahrens */ 1147789Sahrens int 1148789Sahrens zio_wait(zio_t *zio) 1149789Sahrens { 1150789Sahrens int error; 1151789Sahrens 1152789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 11537754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 1154789Sahrens 1155789Sahrens zio->io_waiter = curthread; 1156789Sahrens 11575530Sbonwick zio_execute(zio); 1158789Sahrens 1159789Sahrens mutex_enter(&zio->io_lock); 11607754SJeff.Bonwick@Sun.COM while (zio->io_executor != NULL) 1161789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 1162789Sahrens mutex_exit(&zio->io_lock); 1163789Sahrens 1164789Sahrens error = zio->io_error; 11656523Sek110237 zio_destroy(zio); 1166789Sahrens 1167789Sahrens return (error); 1168789Sahrens } 1169789Sahrens 1170789Sahrens void 1171789Sahrens zio_nowait(zio_t *zio) 1172789Sahrens { 11737754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 11747754SJeff.Bonwick@Sun.COM 11758632SBill.Moore@Sun.COM if 
(zio->io_child_type == ZIO_CHILD_LOGICAL && 11768632SBill.Moore@Sun.COM zio_unique_parent(zio) == NULL) { 11777754SJeff.Bonwick@Sun.COM /* 11787754SJeff.Bonwick@Sun.COM * This is a logical async I/O with no parent to wait for it. 11799234SGeorge.Wilson@Sun.COM * We add it to the spa_async_root_zio "Godfather" I/O which 11809234SGeorge.Wilson@Sun.COM * will ensure they complete prior to unloading the pool. 11817754SJeff.Bonwick@Sun.COM */ 11827754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 11839234SGeorge.Wilson@Sun.COM 11849234SGeorge.Wilson@Sun.COM zio_add_child(spa->spa_async_zio_root, zio); 11857754SJeff.Bonwick@Sun.COM } 11867754SJeff.Bonwick@Sun.COM 11875530Sbonwick zio_execute(zio); 11885530Sbonwick } 11895530Sbonwick 11907754SJeff.Bonwick@Sun.COM /* 11917754SJeff.Bonwick@Sun.COM * ========================================================================== 11927754SJeff.Bonwick@Sun.COM * Reexecute or suspend/resume failed I/O 11937754SJeff.Bonwick@Sun.COM * ========================================================================== 11947754SJeff.Bonwick@Sun.COM */ 11957754SJeff.Bonwick@Sun.COM 11967754SJeff.Bonwick@Sun.COM static void 11977754SJeff.Bonwick@Sun.COM zio_reexecute(zio_t *pio) 11987754SJeff.Bonwick@Sun.COM { 11998632SBill.Moore@Sun.COM zio_t *cio, *cio_next; 12008632SBill.Moore@Sun.COM 12018632SBill.Moore@Sun.COM ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 12028632SBill.Moore@Sun.COM ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 12039443SBill.Moore@Sun.COM ASSERT(pio->io_gang_leader == NULL); 12049443SBill.Moore@Sun.COM ASSERT(pio->io_gang_tree == NULL); 12057754SJeff.Bonwick@Sun.COM 12067754SJeff.Bonwick@Sun.COM pio->io_flags = pio->io_orig_flags; 12077754SJeff.Bonwick@Sun.COM pio->io_stage = pio->io_orig_stage; 12087754SJeff.Bonwick@Sun.COM pio->io_pipeline = pio->io_orig_pipeline; 12097754SJeff.Bonwick@Sun.COM pio->io_reexecute = 0; 12107754SJeff.Bonwick@Sun.COM pio->io_error = 0; 12118632SBill.Moore@Sun.COM for (int w = 0; w < 
ZIO_WAIT_TYPES; w++) 12128632SBill.Moore@Sun.COM pio->io_state[w] = 0; 12137754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 12147754SJeff.Bonwick@Sun.COM pio->io_child_error[c] = 0; 12157754SJeff.Bonwick@Sun.COM 121610922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(pio)) 121710922SJeff.Bonwick@Sun.COM BP_ZERO(pio->io_bp); 12187754SJeff.Bonwick@Sun.COM 12197754SJeff.Bonwick@Sun.COM /* 12207754SJeff.Bonwick@Sun.COM * As we reexecute pio's children, new children could be created. 12218632SBill.Moore@Sun.COM * New children go to the head of pio's io_child_list, however, 12227754SJeff.Bonwick@Sun.COM * so we will (correctly) not reexecute them. The key is that 12238632SBill.Moore@Sun.COM * the remainder of pio's io_child_list, from 'cio_next' onward, 12248632SBill.Moore@Sun.COM * cannot be affected by any side effects of reexecuting 'cio'. 12257754SJeff.Bonwick@Sun.COM */ 12268632SBill.Moore@Sun.COM for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 12278632SBill.Moore@Sun.COM cio_next = zio_walk_children(pio); 12287754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 12298632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12308632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w]++; 12317754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 12328632SBill.Moore@Sun.COM zio_reexecute(cio); 12337754SJeff.Bonwick@Sun.COM } 12347754SJeff.Bonwick@Sun.COM 12357754SJeff.Bonwick@Sun.COM /* 12367754SJeff.Bonwick@Sun.COM * Now that all children have been reexecuted, execute the parent. 12379234SGeorge.Wilson@Sun.COM * We don't reexecute "The Godfather" I/O here as it's the 12389234SGeorge.Wilson@Sun.COM * responsibility of the caller to wait on him. 
12397754SJeff.Bonwick@Sun.COM */ 12409234SGeorge.Wilson@Sun.COM if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 12419234SGeorge.Wilson@Sun.COM zio_execute(pio); 12427754SJeff.Bonwick@Sun.COM } 12437754SJeff.Bonwick@Sun.COM 12445530Sbonwick void 12457754SJeff.Bonwick@Sun.COM zio_suspend(spa_t *spa, zio_t *zio) 12465530Sbonwick { 12477754SJeff.Bonwick@Sun.COM if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 12487754SJeff.Bonwick@Sun.COM fm_panic("Pool '%s' has encountered an uncorrectable I/O " 12497754SJeff.Bonwick@Sun.COM "failure and the failure mode property for this pool " 12507754SJeff.Bonwick@Sun.COM "is set to panic.", spa_name(spa)); 12517754SJeff.Bonwick@Sun.COM 12527754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 12537754SJeff.Bonwick@Sun.COM 12547754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12557754SJeff.Bonwick@Sun.COM 12567754SJeff.Bonwick@Sun.COM if (spa->spa_suspend_zio_root == NULL) 12579234SGeorge.Wilson@Sun.COM spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 12589234SGeorge.Wilson@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 12599234SGeorge.Wilson@Sun.COM ZIO_FLAG_GODFATHER); 12607754SJeff.Bonwick@Sun.COM 12617754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_TRUE; 12627754SJeff.Bonwick@Sun.COM 12637754SJeff.Bonwick@Sun.COM if (zio != NULL) { 12649234SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 12657754SJeff.Bonwick@Sun.COM ASSERT(zio != spa->spa_suspend_zio_root); 12667754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 12678632SBill.Moore@Sun.COM ASSERT(zio_unique_parent(zio) == NULL); 12687754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_DONE); 12697754SJeff.Bonwick@Sun.COM zio_add_child(spa->spa_suspend_zio_root, zio); 12707754SJeff.Bonwick@Sun.COM } 12717754SJeff.Bonwick@Sun.COM 12727754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12735530Sbonwick } 12745530Sbonwick 12759234SGeorge.Wilson@Sun.COM int 
12767754SJeff.Bonwick@Sun.COM zio_resume(spa_t *spa) 12775530Sbonwick { 12789234SGeorge.Wilson@Sun.COM zio_t *pio; 12797754SJeff.Bonwick@Sun.COM 12807754SJeff.Bonwick@Sun.COM /* 12817754SJeff.Bonwick@Sun.COM * Reexecute all previously suspended i/o. 12827754SJeff.Bonwick@Sun.COM */ 12837754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12847754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_FALSE; 12857754SJeff.Bonwick@Sun.COM cv_broadcast(&spa->spa_suspend_cv); 12867754SJeff.Bonwick@Sun.COM pio = spa->spa_suspend_zio_root; 12877754SJeff.Bonwick@Sun.COM spa->spa_suspend_zio_root = NULL; 12887754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12897754SJeff.Bonwick@Sun.COM 12907754SJeff.Bonwick@Sun.COM if (pio == NULL) 12919234SGeorge.Wilson@Sun.COM return (0); 12925530Sbonwick 12939234SGeorge.Wilson@Sun.COM zio_reexecute(pio); 12949234SGeorge.Wilson@Sun.COM return (zio_wait(pio)); 12957754SJeff.Bonwick@Sun.COM } 12967754SJeff.Bonwick@Sun.COM 12977754SJeff.Bonwick@Sun.COM void 12987754SJeff.Bonwick@Sun.COM zio_resume_wait(spa_t *spa) 12997754SJeff.Bonwick@Sun.COM { 13007754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 13017754SJeff.Bonwick@Sun.COM while (spa_suspended(spa)) 13027754SJeff.Bonwick@Sun.COM cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 13037754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 1304789Sahrens } 1305789Sahrens 1306789Sahrens /* 1307789Sahrens * ========================================================================== 13087754SJeff.Bonwick@Sun.COM * Gang blocks. 13097754SJeff.Bonwick@Sun.COM * 13107754SJeff.Bonwick@Sun.COM * A gang block is a collection of small blocks that looks to the DMU 13117754SJeff.Bonwick@Sun.COM * like one large block. 
When zio_dva_allocate() cannot find a block 13127754SJeff.Bonwick@Sun.COM * of the requested size, due to either severe fragmentation or the pool 13137754SJeff.Bonwick@Sun.COM * being nearly full, it calls zio_write_gang_block() to construct the 13147754SJeff.Bonwick@Sun.COM * block from smaller fragments. 13157754SJeff.Bonwick@Sun.COM * 13167754SJeff.Bonwick@Sun.COM * A gang block consists of a gang header (zio_gbh_phys_t) and up to 13177754SJeff.Bonwick@Sun.COM * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 13187754SJeff.Bonwick@Sun.COM * an indirect block: it's an array of block pointers. It consumes 13197754SJeff.Bonwick@Sun.COM * only one sector and hence is allocatable regardless of fragmentation. 13207754SJeff.Bonwick@Sun.COM * The gang header's bps point to its gang members, which hold the data. 13217754SJeff.Bonwick@Sun.COM * 13227754SJeff.Bonwick@Sun.COM * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 13237754SJeff.Bonwick@Sun.COM * as the verifier to ensure uniqueness of the SHA256 checksum. 13247754SJeff.Bonwick@Sun.COM * Critically, the gang block bp's blk_cksum is the checksum of the data, 13257754SJeff.Bonwick@Sun.COM * not the gang header. This ensures that data block signatures (needed for 13267754SJeff.Bonwick@Sun.COM * deduplication) are independent of how the block is physically stored. 13277754SJeff.Bonwick@Sun.COM * 13287754SJeff.Bonwick@Sun.COM * Gang blocks can be nested: a gang member may itself be a gang block. 13297754SJeff.Bonwick@Sun.COM * Thus every gang block is a tree in which root and all interior nodes are 13307754SJeff.Bonwick@Sun.COM * gang headers, and the leaves are normal blocks that contain user data. 13317754SJeff.Bonwick@Sun.COM * The root of the gang tree is called the gang leader. 
13327754SJeff.Bonwick@Sun.COM * 13337754SJeff.Bonwick@Sun.COM * To perform any operation (read, rewrite, free, claim) on a gang block, 13347754SJeff.Bonwick@Sun.COM * zio_gang_assemble() first assembles the gang tree (minus data leaves) 13357754SJeff.Bonwick@Sun.COM * in the io_gang_tree field of the original logical i/o by recursively 13367754SJeff.Bonwick@Sun.COM * reading the gang leader and all gang headers below it. This yields 13377754SJeff.Bonwick@Sun.COM * an in-core tree containing the contents of every gang header and the 13387754SJeff.Bonwick@Sun.COM * bps for every constituent of the gang block. 13397754SJeff.Bonwick@Sun.COM * 13407754SJeff.Bonwick@Sun.COM * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 13417754SJeff.Bonwick@Sun.COM * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 13427754SJeff.Bonwick@Sun.COM * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 13437754SJeff.Bonwick@Sun.COM * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 13447754SJeff.Bonwick@Sun.COM * zio_read_gang() is a wrapper around zio_read() that omits reading gang 13457754SJeff.Bonwick@Sun.COM * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 13467754SJeff.Bonwick@Sun.COM * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 13477754SJeff.Bonwick@Sun.COM * of the gang header plus zio_checksum_compute() of the data to update the 13487754SJeff.Bonwick@Sun.COM * gang header's blk_cksum as described above. 13497754SJeff.Bonwick@Sun.COM * 13507754SJeff.Bonwick@Sun.COM * The two-phase assemble/issue model solves the problem of partial failure -- 13517754SJeff.Bonwick@Sun.COM * what if you'd freed part of a gang block but then couldn't read the 13527754SJeff.Bonwick@Sun.COM * gang header for another part? 
Assembling the entire gang tree first 13537754SJeff.Bonwick@Sun.COM * ensures that all the necessary gang header I/O has succeeded before 13547754SJeff.Bonwick@Sun.COM * starting the actual work of free, claim, or write. Once the gang tree 13557754SJeff.Bonwick@Sun.COM * is assembled, free and claim are in-memory operations that cannot fail. 13567754SJeff.Bonwick@Sun.COM * 13577754SJeff.Bonwick@Sun.COM * In the event that a gang write fails, zio_dva_unallocate() walks the 13587754SJeff.Bonwick@Sun.COM * gang tree to immediately free (i.e. insert back into the space map) 13597754SJeff.Bonwick@Sun.COM * everything we've allocated. This ensures that we don't get ENOSPC 13607754SJeff.Bonwick@Sun.COM * errors during repeated suspend/resume cycles due to a flaky device. 13617754SJeff.Bonwick@Sun.COM * 13627754SJeff.Bonwick@Sun.COM * Gang rewrites only happen during sync-to-convergence. If we can't assemble 13637754SJeff.Bonwick@Sun.COM * the gang tree, we won't modify the block, so we can safely defer the free 13647754SJeff.Bonwick@Sun.COM * (knowing that the block is still intact). If we *can* assemble the gang 13657754SJeff.Bonwick@Sun.COM * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 13667754SJeff.Bonwick@Sun.COM * each constituent bp and we can allocate a new block on the next sync pass. 13677754SJeff.Bonwick@Sun.COM * 13687754SJeff.Bonwick@Sun.COM * In all cases, the gang tree allows complete recovery from partial failure. 
1369789Sahrens * ========================================================================== 1370789Sahrens */ 13715530Sbonwick 13727754SJeff.Bonwick@Sun.COM static zio_t * 13737754SJeff.Bonwick@Sun.COM zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 13747754SJeff.Bonwick@Sun.COM { 13757754SJeff.Bonwick@Sun.COM if (gn != NULL) 13767754SJeff.Bonwick@Sun.COM return (pio); 13775530Sbonwick 13787754SJeff.Bonwick@Sun.COM return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 13797754SJeff.Bonwick@Sun.COM NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 13807754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 1381789Sahrens } 1382789Sahrens 13837754SJeff.Bonwick@Sun.COM zio_t * 13847754SJeff.Bonwick@Sun.COM zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 13856523Sek110237 { 13867754SJeff.Bonwick@Sun.COM zio_t *zio; 13876523Sek110237 13887754SJeff.Bonwick@Sun.COM if (gn != NULL) { 13897754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 13907754SJeff.Bonwick@Sun.COM gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 13917754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 13927754SJeff.Bonwick@Sun.COM /* 13937754SJeff.Bonwick@Sun.COM * As we rewrite each gang header, the pipeline will compute 13947754SJeff.Bonwick@Sun.COM * a new gang block header checksum for it; but no one will 13957754SJeff.Bonwick@Sun.COM * compute a new data checksum, so we do that here. The one 13967754SJeff.Bonwick@Sun.COM * exception is the gang leader: the pipeline already computed 13977754SJeff.Bonwick@Sun.COM * its data checksum because that stage precedes gang assembly. 13987754SJeff.Bonwick@Sun.COM * (Presently, nothing actually uses interior data checksums; 13997754SJeff.Bonwick@Sun.COM * this is just good hygiene.) 
14007754SJeff.Bonwick@Sun.COM */ 14019443SBill.Moore@Sun.COM if (gn != pio->io_gang_leader->io_gang_tree) { 14027754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 14037754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp)); 14047754SJeff.Bonwick@Sun.COM } 140510922SJeff.Bonwick@Sun.COM /* 140610922SJeff.Bonwick@Sun.COM * If we are here to damage data for testing purposes, 140710922SJeff.Bonwick@Sun.COM * leave the GBH alone so that we can detect the damage. 140810922SJeff.Bonwick@Sun.COM */ 140910922SJeff.Bonwick@Sun.COM if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 141010922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 14117754SJeff.Bonwick@Sun.COM } else { 14127754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 14137754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 14147754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14156523Sek110237 } 14166523Sek110237 14177754SJeff.Bonwick@Sun.COM return (zio); 14187754SJeff.Bonwick@Sun.COM } 14197754SJeff.Bonwick@Sun.COM 14207754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14217754SJeff.Bonwick@Sun.COM zio_t * 14227754SJeff.Bonwick@Sun.COM zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14237754SJeff.Bonwick@Sun.COM { 142410922SJeff.Bonwick@Sun.COM return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 142510922SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio))); 14267754SJeff.Bonwick@Sun.COM } 14277754SJeff.Bonwick@Sun.COM 14287754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14297754SJeff.Bonwick@Sun.COM zio_t * 14307754SJeff.Bonwick@Sun.COM zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14317754SJeff.Bonwick@Sun.COM { 14327754SJeff.Bonwick@Sun.COM return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 14337754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 14347754SJeff.Bonwick@Sun.COM } 14357754SJeff.Bonwick@Sun.COM 14367754SJeff.Bonwick@Sun.COM 
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 14377754SJeff.Bonwick@Sun.COM NULL, 14387754SJeff.Bonwick@Sun.COM zio_read_gang, 14397754SJeff.Bonwick@Sun.COM zio_rewrite_gang, 14407754SJeff.Bonwick@Sun.COM zio_free_gang, 14417754SJeff.Bonwick@Sun.COM zio_claim_gang, 14427754SJeff.Bonwick@Sun.COM NULL 14437754SJeff.Bonwick@Sun.COM }; 14447754SJeff.Bonwick@Sun.COM 14457754SJeff.Bonwick@Sun.COM static void zio_gang_tree_assemble_done(zio_t *zio); 14467754SJeff.Bonwick@Sun.COM 14477754SJeff.Bonwick@Sun.COM static zio_gang_node_t * 14487754SJeff.Bonwick@Sun.COM zio_gang_node_alloc(zio_gang_node_t **gnpp) 14497754SJeff.Bonwick@Sun.COM { 14507754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn; 14517754SJeff.Bonwick@Sun.COM 14527754SJeff.Bonwick@Sun.COM ASSERT(*gnpp == NULL); 14537754SJeff.Bonwick@Sun.COM 14547754SJeff.Bonwick@Sun.COM gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 14557754SJeff.Bonwick@Sun.COM gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 14567754SJeff.Bonwick@Sun.COM *gnpp = gn; 14577754SJeff.Bonwick@Sun.COM 14587754SJeff.Bonwick@Sun.COM return (gn); 14596523Sek110237 } 14606523Sek110237 14616523Sek110237 static void 14627754SJeff.Bonwick@Sun.COM zio_gang_node_free(zio_gang_node_t **gnpp) 14637754SJeff.Bonwick@Sun.COM { 14647754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 14657754SJeff.Bonwick@Sun.COM 14667754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 14677754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_child[g] == NULL); 14687754SJeff.Bonwick@Sun.COM 14697754SJeff.Bonwick@Sun.COM zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 14707754SJeff.Bonwick@Sun.COM kmem_free(gn, sizeof (*gn)); 14717754SJeff.Bonwick@Sun.COM *gnpp = NULL; 14727754SJeff.Bonwick@Sun.COM } 14737754SJeff.Bonwick@Sun.COM 14747754SJeff.Bonwick@Sun.COM static void 14757754SJeff.Bonwick@Sun.COM zio_gang_tree_free(zio_gang_node_t **gnpp) 1476789Sahrens { 14777754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 14787754SJeff.Bonwick@Sun.COM 
14797754SJeff.Bonwick@Sun.COM if (gn == NULL) 14807754SJeff.Bonwick@Sun.COM return; 14817754SJeff.Bonwick@Sun.COM 14827754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 14837754SJeff.Bonwick@Sun.COM zio_gang_tree_free(&gn->gn_child[g]); 14847754SJeff.Bonwick@Sun.COM 14857754SJeff.Bonwick@Sun.COM zio_gang_node_free(gnpp); 14867754SJeff.Bonwick@Sun.COM } 14877754SJeff.Bonwick@Sun.COM 14887754SJeff.Bonwick@Sun.COM static void 14899443SBill.Moore@Sun.COM zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 14907754SJeff.Bonwick@Sun.COM { 14917754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1492789Sahrens 14939443SBill.Moore@Sun.COM ASSERT(gio->io_gang_leader == gio); 14947754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp)); 14957754SJeff.Bonwick@Sun.COM 14969443SBill.Moore@Sun.COM zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 14977754SJeff.Bonwick@Sun.COM SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 14989443SBill.Moore@Sun.COM gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 14997754SJeff.Bonwick@Sun.COM } 15007754SJeff.Bonwick@Sun.COM 15017754SJeff.Bonwick@Sun.COM static void 15027754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble_done(zio_t *zio) 15037754SJeff.Bonwick@Sun.COM { 15049443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 15057754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio->io_private; 15067754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 15077754SJeff.Bonwick@Sun.COM 15089443SBill.Moore@Sun.COM ASSERT(gio == zio_unique_parent(zio)); 150910922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 15107754SJeff.Bonwick@Sun.COM 15117754SJeff.Bonwick@Sun.COM if (zio->io_error) 15127754SJeff.Bonwick@Sun.COM return; 15137754SJeff.Bonwick@Sun.COM 15147754SJeff.Bonwick@Sun.COM if (BP_SHOULD_BYTESWAP(bp)) 15157754SJeff.Bonwick@Sun.COM byteswap_uint64_array(zio->io_data, zio->io_size); 15167754SJeff.Bonwick@Sun.COM 15177754SJeff.Bonwick@Sun.COM ASSERT(zio->io_data 
== gn->gn_gbh); 15187754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 15197754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 15207754SJeff.Bonwick@Sun.COM 15217754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 15227754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 15237754SJeff.Bonwick@Sun.COM if (!BP_IS_GANG(gbp)) 15247754SJeff.Bonwick@Sun.COM continue; 15259443SBill.Moore@Sun.COM zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1526789Sahrens } 1527789Sahrens } 1528789Sahrens 15297754SJeff.Bonwick@Sun.COM static void 15307754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1531789Sahrens { 15329443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 15337754SJeff.Bonwick@Sun.COM zio_t *zio; 15347754SJeff.Bonwick@Sun.COM 15357754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp) == !!gn); 15369443SBill.Moore@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 15379443SBill.Moore@Sun.COM ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 15387754SJeff.Bonwick@Sun.COM 15397754SJeff.Bonwick@Sun.COM /* 15407754SJeff.Bonwick@Sun.COM * If you're a gang header, your data is in gn->gn_gbh. 15417754SJeff.Bonwick@Sun.COM * If you're a gang member, your data is in 'data' and gn == NULL. 
15427754SJeff.Bonwick@Sun.COM */ 15439443SBill.Moore@Sun.COM zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1544789Sahrens 15457754SJeff.Bonwick@Sun.COM if (gn != NULL) { 15467754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 15477754SJeff.Bonwick@Sun.COM 15487754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 15497754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 15507754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(gbp)) 15517754SJeff.Bonwick@Sun.COM continue; 15527754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 15537754SJeff.Bonwick@Sun.COM data = (char *)data + BP_GET_PSIZE(gbp); 15547754SJeff.Bonwick@Sun.COM } 15557754SJeff.Bonwick@Sun.COM } 15567754SJeff.Bonwick@Sun.COM 15579443SBill.Moore@Sun.COM if (gn == gio->io_gang_tree) 15589443SBill.Moore@Sun.COM ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 15597754SJeff.Bonwick@Sun.COM 15607754SJeff.Bonwick@Sun.COM if (zio != pio) 15617754SJeff.Bonwick@Sun.COM zio_nowait(zio); 1562789Sahrens } 1563789Sahrens 15645530Sbonwick static int 15657754SJeff.Bonwick@Sun.COM zio_gang_assemble(zio_t *zio) 15665329Sgw25295 { 15675530Sbonwick blkptr_t *bp = zio->io_bp; 15685530Sbonwick 15699443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 15709443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 15719443SBill.Moore@Sun.COM 15729443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 15735530Sbonwick 15747754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1575789Sahrens 15765530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1577789Sahrens } 1578789Sahrens 15795530Sbonwick static int 15807754SJeff.Bonwick@Sun.COM zio_gang_issue(zio_t *zio) 15816523Sek110237 { 15826523Sek110237 blkptr_t *bp = zio->io_bp; 1583789Sahrens 15847754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 15857754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 
15865329Sgw25295 15879443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 15889443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1589789Sahrens 15907754SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 15919443SBill.Moore@Sun.COM zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 15927754SJeff.Bonwick@Sun.COM else 15939443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 1594789Sahrens 15957754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 15965530Sbonwick 15975530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1598789Sahrens } 1599789Sahrens 1600789Sahrens static void 16017754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready(zio_t *zio) 1602789Sahrens { 16038632SBill.Moore@Sun.COM zio_t *pio = zio_unique_parent(zio); 16049443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 16051775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 16061775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1607789Sahrens uint64_t asize; 16087754SJeff.Bonwick@Sun.COM 16097754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(zio->io_bp)) 16107754SJeff.Bonwick@Sun.COM return; 16117754SJeff.Bonwick@Sun.COM 16127754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1613789Sahrens 16147754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 161510922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 161610922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 161710922SJeff.Bonwick@Sun.COM ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 16181775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 16191775Sbillm 1620789Sahrens mutex_enter(&pio->io_lock); 16217754SJeff.Bonwick@Sun.COM for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 16221775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 16231775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 16241775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 16251775Sbillm 
DVA_SET_ASIZE(&pdva[d], asize); 16261775Sbillm } 1627789Sahrens mutex_exit(&pio->io_lock); 1628789Sahrens } 1629789Sahrens 16305329Sgw25295 static int 16317754SJeff.Bonwick@Sun.COM zio_write_gang_block(zio_t *pio) 1632789Sahrens { 16337754SJeff.Bonwick@Sun.COM spa_t *spa = pio->io_spa; 16347754SJeff.Bonwick@Sun.COM blkptr_t *bp = pio->io_bp; 16359443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 16367754SJeff.Bonwick@Sun.COM zio_t *zio; 16377754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn, **gnpp; 1638789Sahrens zio_gbh_phys_t *gbh; 16397754SJeff.Bonwick@Sun.COM uint64_t txg = pio->io_txg; 16407754SJeff.Bonwick@Sun.COM uint64_t resid = pio->io_size; 16417754SJeff.Bonwick@Sun.COM uint64_t lsize; 164210922SJeff.Bonwick@Sun.COM int copies = gio->io_prop.zp_copies; 164310922SJeff.Bonwick@Sun.COM int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 16447754SJeff.Bonwick@Sun.COM zio_prop_t zp; 1645789Sahrens int error; 1646789Sahrens 164710922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 164810922SJeff.Bonwick@Sun.COM bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 16497754SJeff.Bonwick@Sun.COM METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 16505530Sbonwick if (error) { 16517754SJeff.Bonwick@Sun.COM pio->io_error = error; 16525530Sbonwick return (ZIO_PIPELINE_CONTINUE); 16535530Sbonwick } 1654789Sahrens 16559443SBill.Moore@Sun.COM if (pio == gio) { 16569443SBill.Moore@Sun.COM gnpp = &gio->io_gang_tree; 16577754SJeff.Bonwick@Sun.COM } else { 16587754SJeff.Bonwick@Sun.COM gnpp = pio->io_private; 16597754SJeff.Bonwick@Sun.COM ASSERT(pio->io_ready == zio_write_gang_member_ready); 1660789Sahrens } 1661789Sahrens 16627754SJeff.Bonwick@Sun.COM gn = zio_gang_node_alloc(gnpp); 16637754SJeff.Bonwick@Sun.COM gbh = gn->gn_gbh; 16647754SJeff.Bonwick@Sun.COM bzero(gbh, SPA_GANGBLOCKSIZE); 1665789Sahrens 16667754SJeff.Bonwick@Sun.COM /* 16677754SJeff.Bonwick@Sun.COM * Create the gang header. 
16687754SJeff.Bonwick@Sun.COM */ 16697754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 16707754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 16715530Sbonwick 16721775Sbillm /* 16737754SJeff.Bonwick@Sun.COM * Create and nowait the gang children. 16741775Sbillm */ 16757754SJeff.Bonwick@Sun.COM for (int g = 0; resid != 0; resid -= lsize, g++) { 16767754SJeff.Bonwick@Sun.COM lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 16777754SJeff.Bonwick@Sun.COM SPA_MINBLOCKSIZE); 16787754SJeff.Bonwick@Sun.COM ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 16797754SJeff.Bonwick@Sun.COM 16809443SBill.Moore@Sun.COM zp.zp_checksum = gio->io_prop.zp_checksum; 16817754SJeff.Bonwick@Sun.COM zp.zp_compress = ZIO_COMPRESS_OFF; 16827754SJeff.Bonwick@Sun.COM zp.zp_type = DMU_OT_NONE; 16837754SJeff.Bonwick@Sun.COM zp.zp_level = 0; 168410922SJeff.Bonwick@Sun.COM zp.zp_copies = gio->io_prop.zp_copies; 168510922SJeff.Bonwick@Sun.COM zp.zp_dedup = 0; 168610922SJeff.Bonwick@Sun.COM zp.zp_dedup_verify = 0; 16877754SJeff.Bonwick@Sun.COM 16887754SJeff.Bonwick@Sun.COM zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 16897754SJeff.Bonwick@Sun.COM (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 16907754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready, NULL, &gn->gn_child[g], 16917754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 16927754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 16937754SJeff.Bonwick@Sun.COM } 16947754SJeff.Bonwick@Sun.COM 16957754SJeff.Bonwick@Sun.COM /* 16967754SJeff.Bonwick@Sun.COM * Set pio's pipeline to just wait for zio to finish. 
16977754SJeff.Bonwick@Sun.COM */ 16987754SJeff.Bonwick@Sun.COM pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 16997754SJeff.Bonwick@Sun.COM 17007754SJeff.Bonwick@Sun.COM zio_nowait(zio); 17017754SJeff.Bonwick@Sun.COM 17027754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1703789Sahrens } 1704789Sahrens 1705789Sahrens /* 1706789Sahrens * ========================================================================== 170710922SJeff.Bonwick@Sun.COM * Dedup 170810922SJeff.Bonwick@Sun.COM * ========================================================================== 170910922SJeff.Bonwick@Sun.COM */ 171010922SJeff.Bonwick@Sun.COM static void 171110922SJeff.Bonwick@Sun.COM zio_ddt_child_read_done(zio_t *zio) 171210922SJeff.Bonwick@Sun.COM { 171310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 171410922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 171510922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 171610922SJeff.Bonwick@Sun.COM zio_t *pio = zio_unique_parent(zio); 171710922SJeff.Bonwick@Sun.COM 171810922SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 171910922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 172010922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) 172110922SJeff.Bonwick@Sun.COM ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 172210922SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && dde->dde_repair_data == NULL) 172310922SJeff.Bonwick@Sun.COM dde->dde_repair_data = zio->io_data; 172410922SJeff.Bonwick@Sun.COM else 172510922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zio->io_size); 172610922SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 172710922SJeff.Bonwick@Sun.COM } 172810922SJeff.Bonwick@Sun.COM 172910922SJeff.Bonwick@Sun.COM static int 173010922SJeff.Bonwick@Sun.COM zio_ddt_read_start(zio_t *zio) 173110922SJeff.Bonwick@Sun.COM { 173210922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 173310922SJeff.Bonwick@Sun.COM 173410922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 173510922SJeff.Bonwick@Sun.COM 
ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 173610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 173710922SJeff.Bonwick@Sun.COM 173810922SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_DDT]) { 173910922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 174010922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = ddt_repair_start(ddt, bp); 174110922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = dde->dde_phys; 174210922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 174310922SJeff.Bonwick@Sun.COM blkptr_t blk; 174410922SJeff.Bonwick@Sun.COM 174510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vsd == NULL); 174610922SJeff.Bonwick@Sun.COM zio->io_vsd = dde; 174710922SJeff.Bonwick@Sun.COM 174810922SJeff.Bonwick@Sun.COM if (ddp_self == NULL) 174910922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 175010922SJeff.Bonwick@Sun.COM 175110922SJeff.Bonwick@Sun.COM for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 175210922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 175310922SJeff.Bonwick@Sun.COM continue; 175411125SJeff.Bonwick@Sun.COM ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 175511125SJeff.Bonwick@Sun.COM &blk); 175610922SJeff.Bonwick@Sun.COM zio_nowait(zio_read(zio, zio->io_spa, &blk, 175710922SJeff.Bonwick@Sun.COM zio_buf_alloc(zio->io_size), zio->io_size, 175810922SJeff.Bonwick@Sun.COM zio_ddt_child_read_done, dde, zio->io_priority, 175910922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 176010922SJeff.Bonwick@Sun.COM &zio->io_bookmark)); 176110922SJeff.Bonwick@Sun.COM } 176210922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 176310922SJeff.Bonwick@Sun.COM } 176410922SJeff.Bonwick@Sun.COM 176510922SJeff.Bonwick@Sun.COM zio_nowait(zio_read(zio, zio->io_spa, bp, 176610922SJeff.Bonwick@Sun.COM zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 176710922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 
176810922SJeff.Bonwick@Sun.COM 176910922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 177010922SJeff.Bonwick@Sun.COM } 177110922SJeff.Bonwick@Sun.COM 177210922SJeff.Bonwick@Sun.COM static int 177310922SJeff.Bonwick@Sun.COM zio_ddt_read_done(zio_t *zio) 177410922SJeff.Bonwick@Sun.COM { 177510922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 177610922SJeff.Bonwick@Sun.COM 177710922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 177810922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 177910922SJeff.Bonwick@Sun.COM 178010922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 178110922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 178210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 178310922SJeff.Bonwick@Sun.COM 178410922SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_DDT]) { 178510922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 178610922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_vsd; 178710922SJeff.Bonwick@Sun.COM if (ddt == NULL) { 178810922SJeff.Bonwick@Sun.COM ASSERT(zio->io_spa->spa_load_state != SPA_LOAD_NONE); 178910922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 179010922SJeff.Bonwick@Sun.COM } 179110922SJeff.Bonwick@Sun.COM if (dde == NULL) { 179210922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 179310922SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 179410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 179510922SJeff.Bonwick@Sun.COM } 179610922SJeff.Bonwick@Sun.COM if (dde->dde_repair_data != NULL) { 179710922SJeff.Bonwick@Sun.COM bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 179810922SJeff.Bonwick@Sun.COM zio->io_child_error[ZIO_CHILD_DDT] = 0; 179910922SJeff.Bonwick@Sun.COM } 180010922SJeff.Bonwick@Sun.COM ddt_repair_done(ddt, dde); 180110922SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 180210922SJeff.Bonwick@Sun.COM } 180310922SJeff.Bonwick@Sun.COM 
180410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vsd == NULL); 180510922SJeff.Bonwick@Sun.COM 180610922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 180710922SJeff.Bonwick@Sun.COM } 180810922SJeff.Bonwick@Sun.COM 180910922SJeff.Bonwick@Sun.COM static boolean_t 181010922SJeff.Bonwick@Sun.COM zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 181110922SJeff.Bonwick@Sun.COM { 181210922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 181310922SJeff.Bonwick@Sun.COM 181410922SJeff.Bonwick@Sun.COM /* 181510922SJeff.Bonwick@Sun.COM * Note: we compare the original data, not the transformed data, 181610922SJeff.Bonwick@Sun.COM * because when zio->io_bp is an override bp, we will not have 181710922SJeff.Bonwick@Sun.COM * pushed the I/O transforms. That's an important optimization 181810922SJeff.Bonwick@Sun.COM * because otherwise we'd compress/encrypt all dmu_sync() data twice. 181910922SJeff.Bonwick@Sun.COM */ 182010922SJeff.Bonwick@Sun.COM for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 182110922SJeff.Bonwick@Sun.COM zio_t *lio = dde->dde_lead_zio[p]; 182210922SJeff.Bonwick@Sun.COM 182310922SJeff.Bonwick@Sun.COM if (lio != NULL) { 182410922SJeff.Bonwick@Sun.COM return (lio->io_orig_size != zio->io_orig_size || 182510922SJeff.Bonwick@Sun.COM bcmp(zio->io_orig_data, lio->io_orig_data, 182610922SJeff.Bonwick@Sun.COM zio->io_orig_size) != 0); 182710922SJeff.Bonwick@Sun.COM } 182810922SJeff.Bonwick@Sun.COM } 182910922SJeff.Bonwick@Sun.COM 183010922SJeff.Bonwick@Sun.COM for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 183110922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 183210922SJeff.Bonwick@Sun.COM 183310922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) { 183410922SJeff.Bonwick@Sun.COM arc_buf_t *abuf = NULL; 183510922SJeff.Bonwick@Sun.COM uint32_t aflags = ARC_WAIT; 183610922SJeff.Bonwick@Sun.COM blkptr_t blk = *zio->io_bp; 183710922SJeff.Bonwick@Sun.COM int error; 183810922SJeff.Bonwick@Sun.COM 
183910922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 184010922SJeff.Bonwick@Sun.COM 184110922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 184210922SJeff.Bonwick@Sun.COM 184310922SJeff.Bonwick@Sun.COM error = arc_read_nolock(NULL, spa, &blk, 184410922SJeff.Bonwick@Sun.COM arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 184510922SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 184610922SJeff.Bonwick@Sun.COM &aflags, &zio->io_bookmark); 184710922SJeff.Bonwick@Sun.COM 184810922SJeff.Bonwick@Sun.COM if (error == 0) { 184910922SJeff.Bonwick@Sun.COM if (arc_buf_size(abuf) != zio->io_orig_size || 185010922SJeff.Bonwick@Sun.COM bcmp(abuf->b_data, zio->io_orig_data, 185110922SJeff.Bonwick@Sun.COM zio->io_orig_size) != 0) 185210922SJeff.Bonwick@Sun.COM error = EEXIST; 185310922SJeff.Bonwick@Sun.COM VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 185410922SJeff.Bonwick@Sun.COM } 185510922SJeff.Bonwick@Sun.COM 185610922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 185710922SJeff.Bonwick@Sun.COM return (error != 0); 185810922SJeff.Bonwick@Sun.COM } 185910922SJeff.Bonwick@Sun.COM } 186010922SJeff.Bonwick@Sun.COM 186110922SJeff.Bonwick@Sun.COM return (B_FALSE); 186210922SJeff.Bonwick@Sun.COM } 186310922SJeff.Bonwick@Sun.COM 186410922SJeff.Bonwick@Sun.COM static void 186510922SJeff.Bonwick@Sun.COM zio_ddt_child_write_ready(zio_t *zio) 186610922SJeff.Bonwick@Sun.COM { 186710922SJeff.Bonwick@Sun.COM int p = zio->io_prop.zp_copies; 186810922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 186910922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 187010922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 187110922SJeff.Bonwick@Sun.COM zio_t *pio; 187210922SJeff.Bonwick@Sun.COM 187310922SJeff.Bonwick@Sun.COM if (zio->io_error) 187410922SJeff.Bonwick@Sun.COM return; 187510922SJeff.Bonwick@Sun.COM 187610922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 187710922SJeff.Bonwick@Sun.COM 187810922SJeff.Bonwick@Sun.COM 
ASSERT(dde->dde_lead_zio[p] == zio); 187910922SJeff.Bonwick@Sun.COM 188010922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, zio->io_bp); 188110922SJeff.Bonwick@Sun.COM 188210922SJeff.Bonwick@Sun.COM while ((pio = zio_walk_parents(zio)) != NULL) 188310922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 188410922SJeff.Bonwick@Sun.COM 188510922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 188610922SJeff.Bonwick@Sun.COM } 188710922SJeff.Bonwick@Sun.COM 188810922SJeff.Bonwick@Sun.COM static void 188910922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done(zio_t *zio) 189010922SJeff.Bonwick@Sun.COM { 189110922SJeff.Bonwick@Sun.COM int p = zio->io_prop.zp_copies; 189210922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 189310922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 189410922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 189510922SJeff.Bonwick@Sun.COM 189610922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 189710922SJeff.Bonwick@Sun.COM 189810922SJeff.Bonwick@Sun.COM ASSERT(ddp->ddp_refcnt == 0); 189910922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 190010922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = NULL; 190110922SJeff.Bonwick@Sun.COM 190210922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 190310922SJeff.Bonwick@Sun.COM while (zio_walk_parents(zio) != NULL) 190410922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 190510922SJeff.Bonwick@Sun.COM } else { 190610922SJeff.Bonwick@Sun.COM ddt_phys_clear(ddp); 190710922SJeff.Bonwick@Sun.COM } 190810922SJeff.Bonwick@Sun.COM 190910922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 191010922SJeff.Bonwick@Sun.COM } 191110922SJeff.Bonwick@Sun.COM 191210922SJeff.Bonwick@Sun.COM static void 191310922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done(zio_t *zio) 191410922SJeff.Bonwick@Sun.COM { 191510922SJeff.Bonwick@Sun.COM int p = DDT_PHYS_DITTO; 191610922SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 191710922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 
191810922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 191910922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 192010922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 192110922SJeff.Bonwick@Sun.COM ddt_key_t *ddk = &dde->dde_key; 192210922SJeff.Bonwick@Sun.COM 192310922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 192410922SJeff.Bonwick@Sun.COM 192510922SJeff.Bonwick@Sun.COM ASSERT(ddp->ddp_refcnt == 0); 192610922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 192710922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = NULL; 192810922SJeff.Bonwick@Sun.COM 192910922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 193010922SJeff.Bonwick@Sun.COM ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 193110922SJeff.Bonwick@Sun.COM ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 193210922SJeff.Bonwick@Sun.COM ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 193310922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 193410922SJeff.Bonwick@Sun.COM ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 193510922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 193610922SJeff.Bonwick@Sun.COM } 193710922SJeff.Bonwick@Sun.COM 193810922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 193910922SJeff.Bonwick@Sun.COM } 194010922SJeff.Bonwick@Sun.COM 194110922SJeff.Bonwick@Sun.COM static int 194210922SJeff.Bonwick@Sun.COM zio_ddt_write(zio_t *zio) 194310922SJeff.Bonwick@Sun.COM { 194410922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 194510922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 194610922SJeff.Bonwick@Sun.COM uint64_t txg = zio->io_txg; 194710922SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 194810922SJeff.Bonwick@Sun.COM int p = zp->zp_copies; 194910922SJeff.Bonwick@Sun.COM int ditto_copies; 195010922SJeff.Bonwick@Sun.COM zio_t *cio = NULL; 195110922SJeff.Bonwick@Sun.COM zio_t *dio = NULL; 195210922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(spa, bp); 195310922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 195410922SJeff.Bonwick@Sun.COM 
ddt_phys_t *ddp; 195510922SJeff.Bonwick@Sun.COM 195610922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 195710922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 195810922SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 195910922SJeff.Bonwick@Sun.COM 196010922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 196110922SJeff.Bonwick@Sun.COM dde = ddt_lookup(ddt, bp, B_TRUE); 196210922SJeff.Bonwick@Sun.COM ddp = &dde->dde_phys[p]; 196310922SJeff.Bonwick@Sun.COM 196410922SJeff.Bonwick@Sun.COM if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 196510922SJeff.Bonwick@Sun.COM /* 196610922SJeff.Bonwick@Sun.COM * If we're using a weak checksum, upgrade to a strong checksum 196710922SJeff.Bonwick@Sun.COM * and try again. If we're already using a strong checksum, 196810922SJeff.Bonwick@Sun.COM * we can't resolve it, so just convert to an ordinary write. 196910922SJeff.Bonwick@Sun.COM * (And automatically e-mail a paper to Nature?) 197010922SJeff.Bonwick@Sun.COM */ 197110922SJeff.Bonwick@Sun.COM if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 197210922SJeff.Bonwick@Sun.COM zp->zp_checksum = spa_dedup_checksum(spa); 197310922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 197410922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 197510922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 197610922SJeff.Bonwick@Sun.COM } else { 197710922SJeff.Bonwick@Sun.COM zp->zp_dedup = 0; 197810922SJeff.Bonwick@Sun.COM } 197910922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 198010922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 198110922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 198210922SJeff.Bonwick@Sun.COM } 198310922SJeff.Bonwick@Sun.COM 198410922SJeff.Bonwick@Sun.COM ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 198510922SJeff.Bonwick@Sun.COM ASSERT(ditto_copies < SPA_DVAS_PER_BP); 198610922SJeff.Bonwick@Sun.COM 198710922SJeff.Bonwick@Sun.COM if (ditto_copies > ddt_ditto_copies_present(dde) && 198810922SJeff.Bonwick@Sun.COM 
dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 198910922SJeff.Bonwick@Sun.COM zio_prop_t czp = *zp; 199010922SJeff.Bonwick@Sun.COM 199110922SJeff.Bonwick@Sun.COM czp.zp_copies = ditto_copies; 199210922SJeff.Bonwick@Sun.COM 199310922SJeff.Bonwick@Sun.COM /* 199410922SJeff.Bonwick@Sun.COM * If we arrived here with an override bp, we won't have run 199510922SJeff.Bonwick@Sun.COM * the transform stack, so we won't have the data we need to 199610922SJeff.Bonwick@Sun.COM * generate a child i/o. So, toss the override bp and restart. 199710922SJeff.Bonwick@Sun.COM * This is safe, because using the override bp is just an 199810922SJeff.Bonwick@Sun.COM * optimization; and it's rare, so the cost doesn't matter. 199910922SJeff.Bonwick@Sun.COM */ 200010922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 200110922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 200210922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 200310922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 200410922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 200510922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 200610922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 200710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 200810922SJeff.Bonwick@Sun.COM } 200910922SJeff.Bonwick@Sun.COM 201010922SJeff.Bonwick@Sun.COM dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 201110922SJeff.Bonwick@Sun.COM zio->io_orig_size, &czp, NULL, 201210922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done, dde, zio->io_priority, 201310922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 201410922SJeff.Bonwick@Sun.COM 201510922SJeff.Bonwick@Sun.COM zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 201610922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 201710922SJeff.Bonwick@Sun.COM } 201810922SJeff.Bonwick@Sun.COM 201910922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 202010922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 
202110922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, bp, txg); 202210922SJeff.Bonwick@Sun.COM if (dde->dde_lead_zio[p] != NULL) 202310922SJeff.Bonwick@Sun.COM zio_add_child(zio, dde->dde_lead_zio[p]); 202410922SJeff.Bonwick@Sun.COM else 202510922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 202610922SJeff.Bonwick@Sun.COM } else if (zio->io_bp_override) { 202710922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == txg); 202810922SJeff.Bonwick@Sun.COM ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 202910922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 203010922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 203110922SJeff.Bonwick@Sun.COM } else { 203210922SJeff.Bonwick@Sun.COM cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 203310922SJeff.Bonwick@Sun.COM zio->io_orig_size, zp, zio_ddt_child_write_ready, 203410922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done, dde, zio->io_priority, 203510922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 203610922SJeff.Bonwick@Sun.COM 203710922SJeff.Bonwick@Sun.COM zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 203810922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = cio; 203910922SJeff.Bonwick@Sun.COM } 204010922SJeff.Bonwick@Sun.COM 204110922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 204210922SJeff.Bonwick@Sun.COM 204310922SJeff.Bonwick@Sun.COM if (cio) 204410922SJeff.Bonwick@Sun.COM zio_nowait(cio); 204510922SJeff.Bonwick@Sun.COM if (dio) 204610922SJeff.Bonwick@Sun.COM zio_nowait(dio); 204710922SJeff.Bonwick@Sun.COM 204810922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 204910922SJeff.Bonwick@Sun.COM } 205010922SJeff.Bonwick@Sun.COM 205110922SJeff.Bonwick@Sun.COM static int 205210922SJeff.Bonwick@Sun.COM zio_ddt_free(zio_t *zio) 205310922SJeff.Bonwick@Sun.COM { 205410922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 205510922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 205610922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(spa, bp); 205710922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 
205810922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 205910922SJeff.Bonwick@Sun.COM 206010922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 206110922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 206210922SJeff.Bonwick@Sun.COM 206310922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 206410922SJeff.Bonwick@Sun.COM dde = ddt_lookup(ddt, bp, B_TRUE); 206510922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 206610922SJeff.Bonwick@Sun.COM ddt_phys_decref(ddp); 206710922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 206810922SJeff.Bonwick@Sun.COM 206910922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 207010922SJeff.Bonwick@Sun.COM } 207110922SJeff.Bonwick@Sun.COM 207210922SJeff.Bonwick@Sun.COM /* 207310922SJeff.Bonwick@Sun.COM * ========================================================================== 2074789Sahrens * Allocate and free blocks 2075789Sahrens * ========================================================================== 2076789Sahrens */ 20775530Sbonwick static int 2078789Sahrens zio_dva_allocate(zio_t *zio) 2079789Sahrens { 20804527Sperrin spa_t *spa = zio->io_spa; 208110922SJeff.Bonwick@Sun.COM metaslab_class_t *mc = spa_normal_class(spa); 2082789Sahrens blkptr_t *bp = zio->io_bp; 2083789Sahrens int error; 2084789Sahrens 20859443SBill.Moore@Sun.COM if (zio->io_gang_leader == NULL) { 20869443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 20879443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 20889443SBill.Moore@Sun.COM } 20899443SBill.Moore@Sun.COM 2090789Sahrens ASSERT(BP_IS_HOLE(bp)); 20911775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 209210922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, >, 0); 209310922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2094789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2095789Sahrens 20967754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, mc, zio->io_size, bp, 209710922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies, zio->io_txg, NULL, 
0); 2098789Sahrens 20997754SJeff.Bonwick@Sun.COM if (error) { 21007754SJeff.Bonwick@Sun.COM if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 21017754SJeff.Bonwick@Sun.COM return (zio_write_gang_block(zio)); 2102789Sahrens zio->io_error = error; 2103789Sahrens } 21045530Sbonwick 21055530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2106789Sahrens } 2107789Sahrens 21085530Sbonwick static int 2109789Sahrens zio_dva_free(zio_t *zio) 2110789Sahrens { 21117754SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2112789Sahrens 21135530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2114789Sahrens } 2115789Sahrens 21165530Sbonwick static int 2117789Sahrens zio_dva_claim(zio_t *zio) 2118789Sahrens { 21197754SJeff.Bonwick@Sun.COM int error; 21207754SJeff.Bonwick@Sun.COM 21217754SJeff.Bonwick@Sun.COM error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 21227754SJeff.Bonwick@Sun.COM if (error) 21237754SJeff.Bonwick@Sun.COM zio->io_error = error; 2124789Sahrens 21255530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2126789Sahrens } 2127789Sahrens 2128789Sahrens /* 21297754SJeff.Bonwick@Sun.COM * Undo an allocation. This is used by zio_done() when an I/O fails 21307754SJeff.Bonwick@Sun.COM * and we want to give back the block we just allocated. 21317754SJeff.Bonwick@Sun.COM * This handles both normal blocks and gang blocks. 
21327754SJeff.Bonwick@Sun.COM */ 21337754SJeff.Bonwick@Sun.COM static void 21347754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 21357754SJeff.Bonwick@Sun.COM { 21367754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 213710922SJeff.Bonwick@Sun.COM ASSERT(zio->io_bp_override == NULL); 21387754SJeff.Bonwick@Sun.COM 21397754SJeff.Bonwick@Sun.COM if (!BP_IS_HOLE(bp)) 214010922SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 21417754SJeff.Bonwick@Sun.COM 21427754SJeff.Bonwick@Sun.COM if (gn != NULL) { 21437754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 21447754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio, gn->gn_child[g], 21457754SJeff.Bonwick@Sun.COM &gn->gn_gbh->zg_blkptr[g]); 21467754SJeff.Bonwick@Sun.COM } 21477754SJeff.Bonwick@Sun.COM } 21487754SJeff.Bonwick@Sun.COM } 21497754SJeff.Bonwick@Sun.COM 21507754SJeff.Bonwick@Sun.COM /* 21517754SJeff.Bonwick@Sun.COM * Try to allocate an intent log block. Return 0 on success, errno on failure. 
21527754SJeff.Bonwick@Sun.COM */ 21537754SJeff.Bonwick@Sun.COM int 215410922SJeff.Bonwick@Sun.COM zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 215510922SJeff.Bonwick@Sun.COM uint64_t size, boolean_t use_slog) 21567754SJeff.Bonwick@Sun.COM { 215710310SNeil.Perrin@Sun.COM int error = 1; 21587754SJeff.Bonwick@Sun.COM 215910922SJeff.Bonwick@Sun.COM ASSERT(txg > spa_syncing_txg(spa)); 216010922SJeff.Bonwick@Sun.COM 216110879SNeil.Perrin@Sun.COM if (use_slog) 216210922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_log_class(spa), size, 216310310SNeil.Perrin@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21647754SJeff.Bonwick@Sun.COM 21657754SJeff.Bonwick@Sun.COM if (error) 216610922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), size, 21677754SJeff.Bonwick@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21687754SJeff.Bonwick@Sun.COM 21697754SJeff.Bonwick@Sun.COM if (error == 0) { 21707754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(new_bp, size); 21717754SJeff.Bonwick@Sun.COM BP_SET_PSIZE(new_bp, size); 21727754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 21737754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 21747754SJeff.Bonwick@Sun.COM BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 21757754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(new_bp, 0); 217610922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(new_bp, 0); 21777754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 21787754SJeff.Bonwick@Sun.COM } 21797754SJeff.Bonwick@Sun.COM 21807754SJeff.Bonwick@Sun.COM return (error); 21817754SJeff.Bonwick@Sun.COM } 21827754SJeff.Bonwick@Sun.COM 21837754SJeff.Bonwick@Sun.COM /* 218410922SJeff.Bonwick@Sun.COM * Free an intent log block. 
21857754SJeff.Bonwick@Sun.COM */ 21867754SJeff.Bonwick@Sun.COM void 218710922SJeff.Bonwick@Sun.COM zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 21887754SJeff.Bonwick@Sun.COM { 218910922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 21907754SJeff.Bonwick@Sun.COM ASSERT(!BP_IS_GANG(bp)); 21917754SJeff.Bonwick@Sun.COM 219210922SJeff.Bonwick@Sun.COM zio_free(spa, txg, bp); 21937754SJeff.Bonwick@Sun.COM } 21947754SJeff.Bonwick@Sun.COM 21957754SJeff.Bonwick@Sun.COM /* 2196789Sahrens * ========================================================================== 2197789Sahrens * Read and write to physical devices 2198789Sahrens * ========================================================================== 2199789Sahrens */ 22005530Sbonwick static int 22011775Sbillm zio_vdev_io_start(zio_t *zio) 2202789Sahrens { 2203789Sahrens vdev_t *vd = zio->io_vd; 22041775Sbillm uint64_t align; 22055329Sgw25295 spa_t *spa = zio->io_spa; 22065329Sgw25295 22077754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0); 22087754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 22097754SJeff.Bonwick@Sun.COM 22107754SJeff.Bonwick@Sun.COM if (vd == NULL) { 22117754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 22127754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2213789Sahrens 22147754SJeff.Bonwick@Sun.COM /* 22157754SJeff.Bonwick@Sun.COM * The mirror_ops handle multiple DVAs in a single BP. 
22167754SJeff.Bonwick@Sun.COM */ 22175530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 22187754SJeff.Bonwick@Sun.COM } 22191775Sbillm 22207754SJeff.Bonwick@Sun.COM align = 1ULL << vd->vdev_top->vdev_ashift; 2221789Sahrens 22221732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 22231732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 22241732Sbonwick char *abuf = zio_buf_alloc(asize); 22257754SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 22261732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 22271732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 22281732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 22291732Sbonwick } 22307754SJeff.Bonwick@Sun.COM zio_push_transform(zio, abuf, asize, asize, zio_subblock); 22311732Sbonwick } 22321732Sbonwick 22331732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 22341732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 22358241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 22368241SJeff.Bonwick@Sun.COM 22378241SJeff.Bonwick@Sun.COM /* 22388241SJeff.Bonwick@Sun.COM * If this is a repair I/O, and there's no self-healing involved -- 22398241SJeff.Bonwick@Sun.COM * that is, we're just resilvering what we expect to resilver -- 22408241SJeff.Bonwick@Sun.COM * then don't do the I/O unless zio's txg is actually in vd's DTL. 22418241SJeff.Bonwick@Sun.COM * This prevents spurious resilvering with nested replication. 22428241SJeff.Bonwick@Sun.COM * For example, given a mirror of mirrors, (A+B)+(C+D), if only 22438241SJeff.Bonwick@Sun.COM * A is out of date, we'll read from C+D, then use the data to 22448241SJeff.Bonwick@Sun.COM * resilver A+B -- but we don't actually want to resilver B, just A. 22458241SJeff.Bonwick@Sun.COM * The top-level mirror has no way to know this, so instead we just 22468241SJeff.Bonwick@Sun.COM * discard unnecessary repairs as we work our way down the vdev tree. 
22478241SJeff.Bonwick@Sun.COM * The same logic applies to any form of nested replication: 22488241SJeff.Bonwick@Sun.COM * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 22498241SJeff.Bonwick@Sun.COM */ 22508241SJeff.Bonwick@Sun.COM if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 22518241SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 22528241SJeff.Bonwick@Sun.COM zio->io_txg != 0 && /* not a delegated i/o */ 22538241SJeff.Bonwick@Sun.COM !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 22548241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 22558241SJeff.Bonwick@Sun.COM zio_vdev_io_bypass(zio); 22568241SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 22578241SJeff.Bonwick@Sun.COM } 2258789Sahrens 22597754SJeff.Bonwick@Sun.COM if (vd->vdev_ops->vdev_op_leaf && 22607754SJeff.Bonwick@Sun.COM (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 22617754SJeff.Bonwick@Sun.COM 22627754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 22638632SBill.Moore@Sun.COM return (ZIO_PIPELINE_CONTINUE); 22647754SJeff.Bonwick@Sun.COM 22657754SJeff.Bonwick@Sun.COM if ((zio = vdev_queue_io(zio)) == NULL) 22667754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22677754SJeff.Bonwick@Sun.COM 22687754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 22697754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 22707754SJeff.Bonwick@Sun.COM zio_interrupt(zio); 22717754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22727754SJeff.Bonwick@Sun.COM } 22737754SJeff.Bonwick@Sun.COM } 22747754SJeff.Bonwick@Sun.COM 22755530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 2276789Sahrens } 2277789Sahrens 22785530Sbonwick static int 2279789Sahrens zio_vdev_io_done(zio_t *zio) 2280789Sahrens { 22817754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 22827754SJeff.Bonwick@Sun.COM vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 22837754SJeff.Bonwick@Sun.COM boolean_t unexpected_error = B_FALSE; 22845530Sbonwick 22857754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 22867754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22877754SJeff.Bonwick@Sun.COM 22887754SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2289789Sahrens 22907754SJeff.Bonwick@Sun.COM if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 22917754SJeff.Bonwick@Sun.COM 22927754SJeff.Bonwick@Sun.COM vdev_queue_io_done(zio); 22937754SJeff.Bonwick@Sun.COM 22947754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE) 22957754SJeff.Bonwick@Sun.COM vdev_cache_write(zio); 22967754SJeff.Bonwick@Sun.COM 22977754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 22989725SEric.Schrock@Sun.COM zio->io_error = zio_handle_device_injection(vd, 22999725SEric.Schrock@Sun.COM zio, EIO); 2300789Sahrens 23017754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23027754SJeff.Bonwick@Sun.COM zio->io_error = zio_handle_label_injection(zio, EIO); 23037754SJeff.Bonwick@Sun.COM 23047754SJeff.Bonwick@Sun.COM if (zio->io_error) { 23057754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 23067754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23077754SJeff.Bonwick@Sun.COM } else { 23087754SJeff.Bonwick@Sun.COM unexpected_error = B_TRUE; 23097754SJeff.Bonwick@Sun.COM } 23107754SJeff.Bonwick@Sun.COM } 23116976Seschrock } 23127754SJeff.Bonwick@Sun.COM 23137754SJeff.Bonwick@Sun.COM ops->vdev_op_io_done(zio); 2314789Sahrens 23157754SJeff.Bonwick@Sun.COM if (unexpected_error) 23168632SBill.Moore@Sun.COM VERIFY(vdev_probe(vd, zio) == NULL); 23177754SJeff.Bonwick@Sun.COM 23187754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2319789Sahrens } 2320789Sahrens 232110614SJonathan.Adams@Sun.COM /* 232210614SJonathan.Adams@Sun.COM * For non-raidz ZIOs, we can just copy aside the bad data read from the 
232310614SJonathan.Adams@Sun.COM * disk, and use that to finish the checksum ereport later. 232410614SJonathan.Adams@Sun.COM */ 232510614SJonathan.Adams@Sun.COM static void 232610614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 232710614SJonathan.Adams@Sun.COM const void *good_buf) 232810614SJonathan.Adams@Sun.COM { 232910614SJonathan.Adams@Sun.COM /* no processing needed */ 233010614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 233110614SJonathan.Adams@Sun.COM } 233210614SJonathan.Adams@Sun.COM 233310614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 233410614SJonathan.Adams@Sun.COM void 233510614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 233610614SJonathan.Adams@Sun.COM { 233710614SJonathan.Adams@Sun.COM void *buf = zio_buf_alloc(zio->io_size); 233810614SJonathan.Adams@Sun.COM 233910614SJonathan.Adams@Sun.COM bcopy(zio->io_data, buf, zio->io_size); 234010614SJonathan.Adams@Sun.COM 234110614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = zio->io_size; 234210614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = buf; 234310614SJonathan.Adams@Sun.COM zcr->zcr_finish = zio_vsd_default_cksum_finish; 234410614SJonathan.Adams@Sun.COM zcr->zcr_free = zio_buf_free; 234510614SJonathan.Adams@Sun.COM } 234610614SJonathan.Adams@Sun.COM 23475530Sbonwick static int 2348789Sahrens zio_vdev_io_assess(zio_t *zio) 2349789Sahrens { 2350789Sahrens vdev_t *vd = zio->io_vd; 2351789Sahrens 23527754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 23537754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 2354789Sahrens 23557754SJeff.Bonwick@Sun.COM if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 23567754SJeff.Bonwick@Sun.COM spa_config_exit(zio->io_spa, SCL_ZIO, zio); 23577754SJeff.Bonwick@Sun.COM 23587754SJeff.Bonwick@Sun.COM if (zio->io_vsd != NULL) { 235910614SJonathan.Adams@Sun.COM zio->io_vsd_ops->vsd_free(zio); 
23607754SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 23611732Sbonwick } 23621732Sbonwick 23637754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23641544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 2365789Sahrens 2366789Sahrens /* 2367789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 2368789Sahrens */ 23697754SJeff.Bonwick@Sun.COM if (zio->io_error && vd == NULL && 23707754SJeff.Bonwick@Sun.COM !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 23717754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 23727754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2373789Sahrens zio->io_error = 0; 23747754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_RETRY | 23757754SJeff.Bonwick@Sun.COM ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 237610922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 23777754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 23787754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23797754SJeff.Bonwick@Sun.COM } 2380789Sahrens 23817754SJeff.Bonwick@Sun.COM /* 23827754SJeff.Bonwick@Sun.COM * If we got an error on a leaf device, convert it to ENXIO 23837754SJeff.Bonwick@Sun.COM * if the device is not accessible at all. 23847754SJeff.Bonwick@Sun.COM */ 23857754SJeff.Bonwick@Sun.COM if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 23867754SJeff.Bonwick@Sun.COM !vdev_accessible(vd, zio)) 23877754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23887754SJeff.Bonwick@Sun.COM 23897754SJeff.Bonwick@Sun.COM /* 23907754SJeff.Bonwick@Sun.COM * If we can't write to an interior vdev (mirror or RAID-Z), 23917754SJeff.Bonwick@Sun.COM * set vdev_cant_write so that we stop trying to allocate from it. 
23927754SJeff.Bonwick@Sun.COM */ 23937754SJeff.Bonwick@Sun.COM if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 23947754SJeff.Bonwick@Sun.COM vd != NULL && !vd->vdev_ops->vdev_op_leaf) 23957754SJeff.Bonwick@Sun.COM vd->vdev_cant_write = B_TRUE; 23967754SJeff.Bonwick@Sun.COM 23977754SJeff.Bonwick@Sun.COM if (zio->io_error) 23987754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2399789Sahrens 24005530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2401789Sahrens } 2402789Sahrens 2403789Sahrens void 2404789Sahrens zio_vdev_io_reissue(zio_t *zio) 2405789Sahrens { 2406789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2407789Sahrens ASSERT(zio->io_error == 0); 2408789Sahrens 240910922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2410789Sahrens } 2411789Sahrens 2412789Sahrens void 2413789Sahrens zio_vdev_io_redone(zio_t *zio) 2414789Sahrens { 2415789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2416789Sahrens 241710922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2418789Sahrens } 2419789Sahrens 2420789Sahrens void 2421789Sahrens zio_vdev_io_bypass(zio_t *zio) 2422789Sahrens { 2423789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2424789Sahrens ASSERT(zio->io_error == 0); 2425789Sahrens 2426789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 242710922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2428789Sahrens } 2429789Sahrens 2430789Sahrens /* 2431789Sahrens * ========================================================================== 2432789Sahrens * Generate and verify checksums 2433789Sahrens * ========================================================================== 2434789Sahrens */ 24355530Sbonwick static int 2436789Sahrens zio_checksum_generate(zio_t *zio) 2437789Sahrens { 2438789Sahrens blkptr_t *bp = zio->io_bp; 24397754SJeff.Bonwick@Sun.COM enum zio_checksum checksum; 2440789Sahrens 24417754SJeff.Bonwick@Sun.COM if (bp == NULL) { 24427754SJeff.Bonwick@Sun.COM /* 24437754SJeff.Bonwick@Sun.COM 
* This is zio_write_phys(). 24447754SJeff.Bonwick@Sun.COM * We're either generating a label checksum, or none at all. 24457754SJeff.Bonwick@Sun.COM */ 24467754SJeff.Bonwick@Sun.COM checksum = zio->io_prop.zp_checksum; 2447789Sahrens 24487754SJeff.Bonwick@Sun.COM if (checksum == ZIO_CHECKSUM_OFF) 24497754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2450789Sahrens 24517754SJeff.Bonwick@Sun.COM ASSERT(checksum == ZIO_CHECKSUM_LABEL); 24527754SJeff.Bonwick@Sun.COM } else { 24537754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 24547754SJeff.Bonwick@Sun.COM ASSERT(!IO_IS_ALLOCATING(zio)); 24557754SJeff.Bonwick@Sun.COM checksum = ZIO_CHECKSUM_GANG_HEADER; 24567754SJeff.Bonwick@Sun.COM } else { 24577754SJeff.Bonwick@Sun.COM checksum = BP_GET_CHECKSUM(bp); 24587754SJeff.Bonwick@Sun.COM } 24597754SJeff.Bonwick@Sun.COM } 2460789Sahrens 24617754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2462789Sahrens 24635530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2464789Sahrens } 2465789Sahrens 24665530Sbonwick static int 2467789Sahrens zio_checksum_verify(zio_t *zio) 2468789Sahrens { 246910614SJonathan.Adams@Sun.COM zio_bad_cksum_t info; 24707754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 24717754SJeff.Bonwick@Sun.COM int error; 24727754SJeff.Bonwick@Sun.COM 247310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vd != NULL); 247410922SJeff.Bonwick@Sun.COM 24757754SJeff.Bonwick@Sun.COM if (bp == NULL) { 24767754SJeff.Bonwick@Sun.COM /* 24777754SJeff.Bonwick@Sun.COM * This is zio_read_phys(). 24787754SJeff.Bonwick@Sun.COM * We're either verifying a label checksum, or nothing at all. 
24797754SJeff.Bonwick@Sun.COM */ 24807754SJeff.Bonwick@Sun.COM if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 24817754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 24827754SJeff.Bonwick@Sun.COM 24837754SJeff.Bonwick@Sun.COM ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 24847754SJeff.Bonwick@Sun.COM } 24857754SJeff.Bonwick@Sun.COM 248610614SJonathan.Adams@Sun.COM if ((error = zio_checksum_error(zio, &info)) != 0) { 24877754SJeff.Bonwick@Sun.COM zio->io_error = error; 24887754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 248910614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum(zio->io_spa, 249010614SJonathan.Adams@Sun.COM zio->io_vd, zio, zio->io_offset, 249110614SJonathan.Adams@Sun.COM zio->io_size, NULL, &info); 24927754SJeff.Bonwick@Sun.COM } 2493789Sahrens } 2494789Sahrens 24955530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2496789Sahrens } 2497789Sahrens 2498789Sahrens /* 2499789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 2500789Sahrens */ 2501789Sahrens void 2502789Sahrens zio_checksum_verified(zio_t *zio) 2503789Sahrens { 250410922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2505789Sahrens } 2506789Sahrens 2507789Sahrens /* 25087754SJeff.Bonwick@Sun.COM * ========================================================================== 25097754SJeff.Bonwick@Sun.COM * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 25107754SJeff.Bonwick@Sun.COM * An error of 0 indictes success. ENXIO indicates whole-device failure, 25117754SJeff.Bonwick@Sun.COM * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 25127754SJeff.Bonwick@Sun.COM * indicate errors that are specific to one I/O, and most likely permanent. 25137754SJeff.Bonwick@Sun.COM * Any other error is presumed to be worse because we weren't expecting it. 
25147754SJeff.Bonwick@Sun.COM * ========================================================================== 2515789Sahrens */ 25167754SJeff.Bonwick@Sun.COM int 25177754SJeff.Bonwick@Sun.COM zio_worst_error(int e1, int e2) 2518789Sahrens { 25197754SJeff.Bonwick@Sun.COM static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 25207754SJeff.Bonwick@Sun.COM int r1, r2; 25211775Sbillm 25227754SJeff.Bonwick@Sun.COM for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 25237754SJeff.Bonwick@Sun.COM if (e1 == zio_error_rank[r1]) 25247754SJeff.Bonwick@Sun.COM break; 25257754SJeff.Bonwick@Sun.COM 25267754SJeff.Bonwick@Sun.COM for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 25277754SJeff.Bonwick@Sun.COM if (e2 == zio_error_rank[r2]) 25287754SJeff.Bonwick@Sun.COM break; 25297754SJeff.Bonwick@Sun.COM 25307754SJeff.Bonwick@Sun.COM return (r1 > r2 ? e1 : e2); 2531789Sahrens } 2532789Sahrens 2533789Sahrens /* 2534789Sahrens * ========================================================================== 25357754SJeff.Bonwick@Sun.COM * I/O completion 2536789Sahrens * ========================================================================== 2537789Sahrens */ 25387754SJeff.Bonwick@Sun.COM static int 25397754SJeff.Bonwick@Sun.COM zio_ready(zio_t *zio) 25407754SJeff.Bonwick@Sun.COM { 25417754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 25428632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 25437754SJeff.Bonwick@Sun.COM 254410922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 254510922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 25469443SBill.Moore@Sun.COM return (ZIO_PIPELINE_STOP); 25479443SBill.Moore@Sun.COM 25487754SJeff.Bonwick@Sun.COM if (zio->io_ready) { 25497754SJeff.Bonwick@Sun.COM ASSERT(IO_IS_ALLOCATING(zio)); 25507754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 25517754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] 
== 0); 25527754SJeff.Bonwick@Sun.COM 25537754SJeff.Bonwick@Sun.COM zio->io_ready(zio); 25547754SJeff.Bonwick@Sun.COM } 25557754SJeff.Bonwick@Sun.COM 25567754SJeff.Bonwick@Sun.COM if (bp != NULL && bp != &zio->io_bp_copy) 25577754SJeff.Bonwick@Sun.COM zio->io_bp_copy = *bp; 25587754SJeff.Bonwick@Sun.COM 25597754SJeff.Bonwick@Sun.COM if (zio->io_error) 25607754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 25617754SJeff.Bonwick@Sun.COM 25628632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 25638632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = 1; 25648632SBill.Moore@Sun.COM pio = zio_walk_parents(zio); 25658632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 25668632SBill.Moore@Sun.COM 25678632SBill.Moore@Sun.COM /* 25688632SBill.Moore@Sun.COM * As we notify zio's parents, new parents could be added. 25698632SBill.Moore@Sun.COM * New parents go to the head of zio's io_parent_list, however, 25708632SBill.Moore@Sun.COM * so we will (correctly) not notify them. The remainder of zio's 25718632SBill.Moore@Sun.COM * io_parent_list, from 'pio_next' onward, cannot change because 25728632SBill.Moore@Sun.COM * all parents must wait for us to be done before they can be done. 
25738632SBill.Moore@Sun.COM */ 25748632SBill.Moore@Sun.COM for (; pio != NULL; pio = pio_next) { 25758632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 25767754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_READY); 25778632SBill.Moore@Sun.COM } 25787754SJeff.Bonwick@Sun.COM 257910922SJeff.Bonwick@Sun.COM if (zio->io_flags & ZIO_FLAG_NODATA) { 258010922SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp)) { 258110922SJeff.Bonwick@Sun.COM zio->io_flags &= ~ZIO_FLAG_NODATA; 258210922SJeff.Bonwick@Sun.COM } else { 258310922SJeff.Bonwick@Sun.COM ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 258410922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 258510922SJeff.Bonwick@Sun.COM } 258610922SJeff.Bonwick@Sun.COM } 258710922SJeff.Bonwick@Sun.COM 258811026STim.Haley@Sun.COM if (zio_injection_enabled && 258911026STim.Haley@Sun.COM zio->io_spa->spa_syncing_txg == zio->io_txg) 259011026STim.Haley@Sun.COM zio_handle_ignored_writes(zio); 259111026STim.Haley@Sun.COM 25927754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 25937754SJeff.Bonwick@Sun.COM } 25947754SJeff.Bonwick@Sun.COM 25957754SJeff.Bonwick@Sun.COM static int 25967754SJeff.Bonwick@Sun.COM zio_done(zio_t *zio) 25977754SJeff.Bonwick@Sun.COM { 25987754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 25997754SJeff.Bonwick@Sun.COM zio_t *lio = zio->io_logical; 26007754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 26017754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 26027754SJeff.Bonwick@Sun.COM uint64_t psize = zio->io_size; 26038632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 26047754SJeff.Bonwick@Sun.COM 26057754SJeff.Bonwick@Sun.COM /* 26069443SBill.Moore@Sun.COM * If our children haven't all completed, 26077754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 
26087754SJeff.Bonwick@Sun.COM */ 26097754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 26107754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 261110922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 26127754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 26137754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 26147754SJeff.Bonwick@Sun.COM 26157754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 26167754SJeff.Bonwick@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 26177754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[c][w] == 0); 26187754SJeff.Bonwick@Sun.COM 26197754SJeff.Bonwick@Sun.COM if (bp != NULL) { 26207754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[0] == 0); 26217754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[1] == 0); 26227754SJeff.Bonwick@Sun.COM ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 26238632SBill.Moore@Sun.COM (bp == zio_unique_parent(zio)->io_bp)); 26247754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 262510922SJeff.Bonwick@Sun.COM zio->io_bp_override == NULL && 26267754SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 26277754SJeff.Bonwick@Sun.COM ASSERT(!BP_SHOULD_BYTESWAP(bp)); 262810922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 26297754SJeff.Bonwick@Sun.COM ASSERT(BP_COUNT_GANG(bp) == 0 || 26307754SJeff.Bonwick@Sun.COM (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 26317754SJeff.Bonwick@Sun.COM } 26327754SJeff.Bonwick@Sun.COM } 26337754SJeff.Bonwick@Sun.COM 26347754SJeff.Bonwick@Sun.COM /* 263510922SJeff.Bonwick@Sun.COM * If there were child vdev/gang/ddt errors, they apply to us now. 
26367754SJeff.Bonwick@Sun.COM */ 26377754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 26387754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 263910922SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 264010922SJeff.Bonwick@Sun.COM 264110922SJeff.Bonwick@Sun.COM /* 264210922SJeff.Bonwick@Sun.COM * If the I/O on the transformed data was successful, generate any 264310922SJeff.Bonwick@Sun.COM * checksum reports now while we still have the transformed data. 264410922SJeff.Bonwick@Sun.COM */ 264510922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 264610922SJeff.Bonwick@Sun.COM while (zio->io_cksum_report != NULL) { 264710922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 264810922SJeff.Bonwick@Sun.COM uint64_t align = zcr->zcr_align; 264910922SJeff.Bonwick@Sun.COM uint64_t asize = P2ROUNDUP(psize, align); 265010922SJeff.Bonwick@Sun.COM char *abuf = zio->io_data; 265110922SJeff.Bonwick@Sun.COM 265210922SJeff.Bonwick@Sun.COM if (asize != psize) { 265310922SJeff.Bonwick@Sun.COM abuf = zio_buf_alloc(asize); 265410922SJeff.Bonwick@Sun.COM bcopy(zio->io_data, abuf, psize); 265510922SJeff.Bonwick@Sun.COM bzero(abuf + psize, asize - psize); 265610922SJeff.Bonwick@Sun.COM } 265710922SJeff.Bonwick@Sun.COM 265810922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 265910922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 266010922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, abuf); 266110922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 266210922SJeff.Bonwick@Sun.COM 266310922SJeff.Bonwick@Sun.COM if (asize != psize) 266410922SJeff.Bonwick@Sun.COM zio_buf_free(abuf, asize); 266510922SJeff.Bonwick@Sun.COM } 266610922SJeff.Bonwick@Sun.COM } 26677754SJeff.Bonwick@Sun.COM 26687754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); /* note: may set zio->io_error */ 26697754SJeff.Bonwick@Sun.COM 26707754SJeff.Bonwick@Sun.COM vdev_stat_update(zio, psize); 26717754SJeff.Bonwick@Sun.COM 
26727754SJeff.Bonwick@Sun.COM if (zio->io_error) { 26737754SJeff.Bonwick@Sun.COM /* 26747754SJeff.Bonwick@Sun.COM * If this I/O is attached to a particular vdev, 26757754SJeff.Bonwick@Sun.COM * generate an error message describing the I/O failure 26767754SJeff.Bonwick@Sun.COM * at the block level. We ignore these errors if the 26777754SJeff.Bonwick@Sun.COM * device is currently unavailable. 26787754SJeff.Bonwick@Sun.COM */ 26797754SJeff.Bonwick@Sun.COM if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 26807754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 26817754SJeff.Bonwick@Sun.COM 268210685SGeorge.Wilson@Sun.COM if ((zio->io_error == EIO || !(zio->io_flags & 268310685SGeorge.Wilson@Sun.COM (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 268410685SGeorge.Wilson@Sun.COM zio == lio) { 26857754SJeff.Bonwick@Sun.COM /* 26867754SJeff.Bonwick@Sun.COM * For logical I/O requests, tell the SPA to log the 26877754SJeff.Bonwick@Sun.COM * error and generate a logical data ereport. 26887754SJeff.Bonwick@Sun.COM */ 26897754SJeff.Bonwick@Sun.COM spa_log_error(spa, zio); 26907754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 26917754SJeff.Bonwick@Sun.COM 0, 0); 26927754SJeff.Bonwick@Sun.COM } 26937754SJeff.Bonwick@Sun.COM } 26947754SJeff.Bonwick@Sun.COM 26957754SJeff.Bonwick@Sun.COM if (zio->io_error && zio == lio) { 26967754SJeff.Bonwick@Sun.COM /* 26977754SJeff.Bonwick@Sun.COM * Determine whether zio should be reexecuted. This will 26987754SJeff.Bonwick@Sun.COM * propagate all the way to the root via zio_notify_parent(). 
26997754SJeff.Bonwick@Sun.COM */ 27007754SJeff.Bonwick@Sun.COM ASSERT(vd == NULL && bp != NULL); 270110922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 270210922SJeff.Bonwick@Sun.COM 270310922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(zio) && 270410922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 27057754SJeff.Bonwick@Sun.COM if (zio->io_error != ENOSPC) 27067754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_NOW; 27077754SJeff.Bonwick@Sun.COM else 27087754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 270910922SJeff.Bonwick@Sun.COM } 27107754SJeff.Bonwick@Sun.COM 27117754SJeff.Bonwick@Sun.COM if ((zio->io_type == ZIO_TYPE_READ || 27127754SJeff.Bonwick@Sun.COM zio->io_type == ZIO_TYPE_FREE) && 27137754SJeff.Bonwick@Sun.COM zio->io_error == ENXIO && 27148241SJeff.Bonwick@Sun.COM spa->spa_load_state == SPA_LOAD_NONE && 27157754SJeff.Bonwick@Sun.COM spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 27167754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 27177754SJeff.Bonwick@Sun.COM 27187754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 27197754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 272010614SJonathan.Adams@Sun.COM 272110614SJonathan.Adams@Sun.COM /* 272210614SJonathan.Adams@Sun.COM * Here is a possibly good place to attempt to do 272310614SJonathan.Adams@Sun.COM * either combinatorial reconstruction or error correction 272410614SJonathan.Adams@Sun.COM * based on checksums. It also might be a good place 272510614SJonathan.Adams@Sun.COM * to send out preliminary ereports before we suspend 272610614SJonathan.Adams@Sun.COM * processing. 272710614SJonathan.Adams@Sun.COM */ 27287754SJeff.Bonwick@Sun.COM } 27297754SJeff.Bonwick@Sun.COM 27307754SJeff.Bonwick@Sun.COM /* 27317754SJeff.Bonwick@Sun.COM * If there were logical child errors, they apply to us now. 
27327754SJeff.Bonwick@Sun.COM * We defer this until now to avoid conflating logical child 27337754SJeff.Bonwick@Sun.COM * errors with errors that happened to the zio itself when 27347754SJeff.Bonwick@Sun.COM * updating vdev stats and reporting FMA events above. 27357754SJeff.Bonwick@Sun.COM */ 27367754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 27377754SJeff.Bonwick@Sun.COM 273810922SJeff.Bonwick@Sun.COM if ((zio->io_error || zio->io_reexecute) && 273910922SJeff.Bonwick@Sun.COM IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 274010922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 27419443SBill.Moore@Sun.COM zio_dva_unallocate(zio, zio->io_gang_tree, bp); 27429443SBill.Moore@Sun.COM 27439443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 27449443SBill.Moore@Sun.COM 27459470SGeorge.Wilson@Sun.COM /* 27469470SGeorge.Wilson@Sun.COM * Godfather I/Os should never suspend. 27479470SGeorge.Wilson@Sun.COM */ 27489470SGeorge.Wilson@Sun.COM if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 27499470SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 27509470SGeorge.Wilson@Sun.COM zio->io_reexecute = 0; 27519470SGeorge.Wilson@Sun.COM 27529470SGeorge.Wilson@Sun.COM if (zio->io_reexecute) { 27537754SJeff.Bonwick@Sun.COM /* 27547754SJeff.Bonwick@Sun.COM * This is a logical I/O that wants to reexecute. 27557754SJeff.Bonwick@Sun.COM * 27567754SJeff.Bonwick@Sun.COM * Reexecute is top-down. When an i/o fails, if it's not 27577754SJeff.Bonwick@Sun.COM * the root, it simply notifies its parent and sticks around. 27587754SJeff.Bonwick@Sun.COM * The parent, seeing that it still has children in zio_done(), 27597754SJeff.Bonwick@Sun.COM * does the same. This percolates all the way up to the root. 27607754SJeff.Bonwick@Sun.COM * The root i/o will reexecute or suspend the entire tree. 
27617754SJeff.Bonwick@Sun.COM * 27627754SJeff.Bonwick@Sun.COM * This approach ensures that zio_reexecute() honors 27637754SJeff.Bonwick@Sun.COM * all the original i/o dependency relationships, e.g. 27647754SJeff.Bonwick@Sun.COM * parents not executing until children are ready. 27657754SJeff.Bonwick@Sun.COM */ 27667754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 27677754SJeff.Bonwick@Sun.COM 27689443SBill.Moore@Sun.COM zio->io_gang_leader = NULL; 27697754SJeff.Bonwick@Sun.COM 27708632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 27718632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 27728632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 27738632SBill.Moore@Sun.COM 27749234SGeorge.Wilson@Sun.COM /* 27759234SGeorge.Wilson@Sun.COM * "The Godfather" I/O monitors its children but is 27769234SGeorge.Wilson@Sun.COM * not a true parent to them. It will track them through 27779234SGeorge.Wilson@Sun.COM * the pipeline but severs its ties whenever they get into 27789234SGeorge.Wilson@Sun.COM * trouble (e.g. suspended). This allows "The Godfather" 27799234SGeorge.Wilson@Sun.COM * I/O to return status without blocking. 
27809234SGeorge.Wilson@Sun.COM */ 27819234SGeorge.Wilson@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 27829234SGeorge.Wilson@Sun.COM zio_link_t *zl = zio->io_walk_link; 27839234SGeorge.Wilson@Sun.COM pio_next = zio_walk_parents(zio); 27849234SGeorge.Wilson@Sun.COM 27859234SGeorge.Wilson@Sun.COM if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 27869234SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 27879234SGeorge.Wilson@Sun.COM zio_remove_child(pio, zio, zl); 27889234SGeorge.Wilson@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 27899234SGeorge.Wilson@Sun.COM } 27909234SGeorge.Wilson@Sun.COM } 27919234SGeorge.Wilson@Sun.COM 27928632SBill.Moore@Sun.COM if ((pio = zio_unique_parent(zio)) != NULL) { 27937754SJeff.Bonwick@Sun.COM /* 27947754SJeff.Bonwick@Sun.COM * We're not a root i/o, so there's nothing to do 27957754SJeff.Bonwick@Sun.COM * but notify our parent. Don't propagate errors 27967754SJeff.Bonwick@Sun.COM * upward since we haven't permanently failed yet. 27977754SJeff.Bonwick@Sun.COM */ 27989470SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 27997754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 28007754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28017754SJeff.Bonwick@Sun.COM } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 28027754SJeff.Bonwick@Sun.COM /* 28037754SJeff.Bonwick@Sun.COM * We'd fail again if we reexecuted now, so suspend 28047754SJeff.Bonwick@Sun.COM * until conditions improve (e.g. device comes online). 28057754SJeff.Bonwick@Sun.COM */ 28067754SJeff.Bonwick@Sun.COM zio_suspend(spa, zio); 28077754SJeff.Bonwick@Sun.COM } else { 28087754SJeff.Bonwick@Sun.COM /* 28097754SJeff.Bonwick@Sun.COM * Reexecution is potentially a huge amount of work. 28107754SJeff.Bonwick@Sun.COM * Hand it off to the otherwise-unused claim taskq. 
28117754SJeff.Bonwick@Sun.COM */ 28127754SJeff.Bonwick@Sun.COM (void) taskq_dispatch( 28137754SJeff.Bonwick@Sun.COM spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 28147754SJeff.Bonwick@Sun.COM (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 28157754SJeff.Bonwick@Sun.COM } 28167754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28177754SJeff.Bonwick@Sun.COM } 28187754SJeff.Bonwick@Sun.COM 281910922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 28209470SGeorge.Wilson@Sun.COM ASSERT(zio->io_reexecute == 0); 28217754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 28227754SJeff.Bonwick@Sun.COM 282310922SJeff.Bonwick@Sun.COM /* 282410922SJeff.Bonwick@Sun.COM * Report any checksum errors, since the I/O is complete. 282510922SJeff.Bonwick@Sun.COM */ 282610614SJonathan.Adams@Sun.COM while (zio->io_cksum_report != NULL) { 282710922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 282810922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 282910922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 283010922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, NULL); 283110922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 283210614SJonathan.Adams@Sun.COM } 283310614SJonathan.Adams@Sun.COM 28348632SBill.Moore@Sun.COM /* 28358632SBill.Moore@Sun.COM * It is the responsibility of the done callback to ensure that this 28368632SBill.Moore@Sun.COM * particular zio is no longer discoverable for adoption, and as 28378632SBill.Moore@Sun.COM * such, cannot acquire any new parents. 
28388632SBill.Moore@Sun.COM */ 28397754SJeff.Bonwick@Sun.COM if (zio->io_done) 28407754SJeff.Bonwick@Sun.COM zio->io_done(zio); 28417754SJeff.Bonwick@Sun.COM 28428632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 28438632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 28448632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 28457754SJeff.Bonwick@Sun.COM 28468632SBill.Moore@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 28478632SBill.Moore@Sun.COM zio_link_t *zl = zio->io_walk_link; 28488632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 28498632SBill.Moore@Sun.COM zio_remove_child(pio, zio, zl); 28507754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28517754SJeff.Bonwick@Sun.COM } 28527754SJeff.Bonwick@Sun.COM 28537754SJeff.Bonwick@Sun.COM if (zio->io_waiter != NULL) { 28547754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 28557754SJeff.Bonwick@Sun.COM zio->io_executor = NULL; 28567754SJeff.Bonwick@Sun.COM cv_broadcast(&zio->io_cv); 28577754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 28587754SJeff.Bonwick@Sun.COM } else { 28597754SJeff.Bonwick@Sun.COM zio_destroy(zio); 28607754SJeff.Bonwick@Sun.COM } 28617754SJeff.Bonwick@Sun.COM 28627754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28637754SJeff.Bonwick@Sun.COM } 28647754SJeff.Bonwick@Sun.COM 28657754SJeff.Bonwick@Sun.COM /* 28667754SJeff.Bonwick@Sun.COM * ========================================================================== 28677754SJeff.Bonwick@Sun.COM * I/O pipeline definition 28687754SJeff.Bonwick@Sun.COM * ========================================================================== 28697754SJeff.Bonwick@Sun.COM */ 287010922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[] = { 28715530Sbonwick NULL, 287210922SJeff.Bonwick@Sun.COM zio_read_bp_init, 287310922SJeff.Bonwick@Sun.COM zio_free_bp_init, 28745530Sbonwick zio_issue_async, 28757754SJeff.Bonwick@Sun.COM zio_write_bp_init, 2876789Sahrens zio_checksum_generate, 
287710922SJeff.Bonwick@Sun.COM zio_ddt_read_start, 287810922SJeff.Bonwick@Sun.COM zio_ddt_read_done, 287910922SJeff.Bonwick@Sun.COM zio_ddt_write, 288010922SJeff.Bonwick@Sun.COM zio_ddt_free, 28817754SJeff.Bonwick@Sun.COM zio_gang_assemble, 28827754SJeff.Bonwick@Sun.COM zio_gang_issue, 2883789Sahrens zio_dva_allocate, 2884789Sahrens zio_dva_free, 2885789Sahrens zio_dva_claim, 2886789Sahrens zio_ready, 2887789Sahrens zio_vdev_io_start, 2888789Sahrens zio_vdev_io_done, 2889789Sahrens zio_vdev_io_assess, 2890789Sahrens zio_checksum_verify, 28917754SJeff.Bonwick@Sun.COM zio_done 2892789Sahrens }; 2893