1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 2212296SLin.Ling@Sun.COM * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23789Sahrens */ 24789Sahrens 25789Sahrens #include <sys/zfs_context.h> 261544Seschrock #include <sys/fm/fs/zfs.h> 27789Sahrens #include <sys/spa.h> 28789Sahrens #include <sys/txg.h> 29789Sahrens #include <sys/spa_impl.h> 30789Sahrens #include <sys/vdev_impl.h> 31789Sahrens #include <sys/zio_impl.h> 32789Sahrens #include <sys/zio_compress.h> 33789Sahrens #include <sys/zio_checksum.h> 3410922SJeff.Bonwick@Sun.COM #include <sys/dmu_objset.h> 3510922SJeff.Bonwick@Sun.COM #include <sys/arc.h> 3610922SJeff.Bonwick@Sun.COM #include <sys/ddt.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 4711146SGeorge.Wilson@Sun.COM 0, /* ZIO_PRIORITY_LOG_WRITE */ 4811146SGeorge.Wilson@Sun.COM 1, /* ZIO_PRIORITY_CACHE_FILL */ 4911146SGeorge.Wilson@Sun.COM 1, /* ZIO_PRIORITY_AGG */ 50789Sahrens 4, /* ZIO_PRIORITY_FREE */ 5111146SGeorge.Wilson@Sun.COM 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 5211146SGeorge.Wilson@Sun.COM 6, /* ZIO_PRIORITY_ASYNC_READ */ 53789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 54789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 5512450SGeorge.Wilson@Sun.COM 2, /* ZIO_PRIORITY_DDT_PREFETCH */ 56789Sahrens }; 57789Sahrens 58789Sahrens /* 59789Sahrens * ========================================================================== 60789Sahrens * I/O type descriptions 61789Sahrens * ========================================================================== 62789Sahrens */ 63789Sahrens char *zio_type_name[ZIO_TYPES] = { 6411146SGeorge.Wilson@Sun.COM "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 6511146SGeorge.Wilson@Sun.COM "zio_ioctl" 6611146SGeorge.Wilson@Sun.COM }; 67789Sahrens 68789Sahrens /* 69789Sahrens * ========================================================================== 70789Sahrens * I/O kmem caches 71789Sahrens * ========================================================================== 72789Sahrens */ 734055Seschrock kmem_cache_t *zio_cache; 748632SBill.Moore@Sun.COM kmem_cache_t *zio_link_cache; 75789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 763290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 773290Sjohansen 783290Sjohansen #ifdef _KERNEL 793290Sjohansen extern vmem_t *zio_alloc_arena; 803290Sjohansen #endif 81789Sahrens 825329Sgw25295 /* 837754SJeff.Bonwick@Sun.COM * An allocating zio is one that either currently has the DVA allocate 847754SJeff.Bonwick@Sun.COM * stage set or will have it later in its lifetime. 855329Sgw25295 */ 8610922SJeff.Bonwick@Sun.COM #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 8710922SJeff.Bonwick@Sun.COM 8811173SJonathan.Adams@Sun.COM boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 8911173SJonathan.Adams@Sun.COM 9010922SJeff.Bonwick@Sun.COM #ifdef ZFS_DEBUG 9110922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 16384; 9210922SJeff.Bonwick@Sun.COM #else 9310922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 0; 9410922SJeff.Bonwick@Sun.COM #endif 955329Sgw25295 96789Sahrens void 97789Sahrens zio_init(void) 98789Sahrens { 99789Sahrens size_t c; 1003290Sjohansen vmem_t *data_alloc_arena = NULL; 1013290Sjohansen 1023290Sjohansen #ifdef _KERNEL 1033290Sjohansen data_alloc_arena = zio_alloc_arena; 1043290Sjohansen #endif 1058632SBill.Moore@Sun.COM zio_cache = kmem_cache_create("zio_cache", 1068632SBill.Moore@Sun.COM sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1078632SBill.Moore@Sun.COM zio_link_cache = kmem_cache_create("zio_link_cache", 1088632SBill.Moore@Sun.COM sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1094055Seschrock 110789Sahrens /* 111789Sahrens * For small buffers, we want a cache for each multiple of 112789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 113789Sahrens * for each quarter-power of 2. For large buffers, we want 114789Sahrens * a cache for each multiple of PAGESIZE. 115789Sahrens */ 116789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 117789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 118789Sahrens size_t p2 = size; 119789Sahrens size_t align = 0; 120789Sahrens 121789Sahrens while (p2 & (p2 - 1)) 122789Sahrens p2 &= p2 - 1; 123789Sahrens 124789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 125789Sahrens align = SPA_MINBLOCKSIZE; 126789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 127789Sahrens align = PAGESIZE; 128789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 129789Sahrens align = p2 >> 2; 130789Sahrens } 131789Sahrens 132789Sahrens if (align != 0) { 1333290Sjohansen char name[36]; 1342856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 135789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 13610922SJeff.Bonwick@Sun.COM align, NULL, NULL, NULL, NULL, NULL, 13710922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 1383290Sjohansen 1393290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1403290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1413290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 14210922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 143789Sahrens } 144789Sahrens } 145789Sahrens 146789Sahrens while (--c != 0) { 147789Sahrens ASSERT(zio_buf_cache[c] != NULL); 148789Sahrens if (zio_buf_cache[c - 1] == NULL) 149789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1503290Sjohansen 1513290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1523290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1533290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 154789Sahrens } 1551544Seschrock 1561544Seschrock zio_inject_init(); 157789Sahrens } 158789Sahrens 159789Sahrens void 160789Sahrens zio_fini(void) 161789Sahrens { 162789Sahrens size_t c; 163789Sahrens kmem_cache_t *last_cache = NULL; 1643290Sjohansen kmem_cache_t *last_data_cache = NULL; 165789Sahrens 166789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 167789Sahrens if (zio_buf_cache[c] != last_cache) { 168789Sahrens last_cache = zio_buf_cache[c]; 169789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 170789Sahrens } 171789Sahrens zio_buf_cache[c] = NULL; 1723290Sjohansen 1733290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1743290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1753290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1763290Sjohansen } 1773290Sjohansen zio_data_buf_cache[c] = NULL; 178789Sahrens } 1791544Seschrock 1808632SBill.Moore@Sun.COM kmem_cache_destroy(zio_link_cache); 1814055Seschrock kmem_cache_destroy(zio_cache); 1824055Seschrock 1831544Seschrock zio_inject_fini(); 184789Sahrens } 185789Sahrens 186789Sahrens /* 187789Sahrens * ========================================================================== 188789Sahrens * Allocate and free I/O buffers 189789Sahrens * ========================================================================== 190789Sahrens */ 1913290Sjohansen 1923290Sjohansen /* 1933290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1943290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1953290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1963290Sjohansen * excess / transient data in-core during a crashdump. 1973290Sjohansen */ 198789Sahrens void * 199789Sahrens zio_buf_alloc(size_t size) 200789Sahrens { 201789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 202789Sahrens 203789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 204789Sahrens 2056245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 206789Sahrens } 207789Sahrens 2083290Sjohansen /* 2093290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2103290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2113290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2123290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2133290Sjohansen */ 2143290Sjohansen void * 2153290Sjohansen zio_data_buf_alloc(size_t size) 2163290Sjohansen { 2173290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2183290Sjohansen 2193290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2203290Sjohansen 2216245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2223290Sjohansen } 2233290Sjohansen 224789Sahrens void 225789Sahrens zio_buf_free(void *buf, size_t size) 226789Sahrens { 227789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 228789Sahrens 229789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 230789Sahrens 231789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 232789Sahrens } 233789Sahrens 2343290Sjohansen void 2353290Sjohansen zio_data_buf_free(void *buf, size_t size) 2363290Sjohansen { 2373290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2383290Sjohansen 2393290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2403290Sjohansen 2413290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2423290Sjohansen } 2433463Sahrens 244789Sahrens /* 245789Sahrens * ========================================================================== 246789Sahrens * Push and pop I/O transform buffers 247789Sahrens * ========================================================================== 248789Sahrens */ 249789Sahrens static void 2507754SJeff.Bonwick@Sun.COM zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 2517754SJeff.Bonwick@Sun.COM zio_transform_func_t *transform) 252789Sahrens { 253789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 254789Sahrens 2557754SJeff.Bonwick@Sun.COM zt->zt_orig_data = zio->io_data; 2567754SJeff.Bonwick@Sun.COM zt->zt_orig_size = zio->io_size; 257789Sahrens zt->zt_bufsize = bufsize; 2587754SJeff.Bonwick@Sun.COM zt->zt_transform = transform; 259789Sahrens 260789Sahrens zt->zt_next = zio->io_transform_stack; 261789Sahrens zio->io_transform_stack = zt; 262789Sahrens 263789Sahrens zio->io_data = data; 264789Sahrens zio->io_size = size; 265789Sahrens } 266789Sahrens 267789Sahrens static void 2687754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio_t *zio) 269789Sahrens { 2707754SJeff.Bonwick@Sun.COM zio_transform_t *zt; 271789Sahrens 2727754SJeff.Bonwick@Sun.COM while ((zt = zio->io_transform_stack) != NULL) { 2737754SJeff.Bonwick@Sun.COM if (zt->zt_transform != NULL) 2747754SJeff.Bonwick@Sun.COM zt->zt_transform(zio, 2757754SJeff.Bonwick@Sun.COM zt->zt_orig_data, zt->zt_orig_size); 276789Sahrens 27710922SJeff.Bonwick@Sun.COM if (zt->zt_bufsize != 0) 27810922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zt->zt_bufsize); 279789Sahrens 2807754SJeff.Bonwick@Sun.COM zio->io_data = zt->zt_orig_data; 2817754SJeff.Bonwick@Sun.COM zio->io_size = zt->zt_orig_size; 2827754SJeff.Bonwick@Sun.COM zio->io_transform_stack = zt->zt_next; 283789Sahrens 2847754SJeff.Bonwick@Sun.COM kmem_free(zt, sizeof (zio_transform_t)); 285789Sahrens } 286789Sahrens } 287789Sahrens 288789Sahrens /* 289789Sahrens * ========================================================================== 2907754SJeff.Bonwick@Sun.COM * I/O transform callbacks for subblocks and decompression 2917754SJeff.Bonwick@Sun.COM * ========================================================================== 2927754SJeff.Bonwick@Sun.COM */ 2937754SJeff.Bonwick@Sun.COM static void 2947754SJeff.Bonwick@Sun.COM zio_subblock(zio_t *zio, void *data, uint64_t size) 2957754SJeff.Bonwick@Sun.COM { 2967754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size > size); 2977754SJeff.Bonwick@Sun.COM 2987754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ) 2997754SJeff.Bonwick@Sun.COM bcopy(zio->io_data, data, size); 3007754SJeff.Bonwick@Sun.COM } 3017754SJeff.Bonwick@Sun.COM 3027754SJeff.Bonwick@Sun.COM static void 3037754SJeff.Bonwick@Sun.COM zio_decompress(zio_t *zio, void *data, uint64_t size) 3047754SJeff.Bonwick@Sun.COM { 3057754SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && 3067754SJeff.Bonwick@Sun.COM zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 30710922SJeff.Bonwick@Sun.COM zio->io_data, data, zio->io_size, size) != 0) 3087754SJeff.Bonwick@Sun.COM zio->io_error = EIO; 3097754SJeff.Bonwick@Sun.COM } 3107754SJeff.Bonwick@Sun.COM 3117754SJeff.Bonwick@Sun.COM /* 3127754SJeff.Bonwick@Sun.COM * ========================================================================== 3137754SJeff.Bonwick@Sun.COM * I/O parent/child relationships and pipeline interlocks 3147754SJeff.Bonwick@Sun.COM * ========================================================================== 3157754SJeff.Bonwick@Sun.COM */ 3168632SBill.Moore@Sun.COM /* 3178632SBill.Moore@Sun.COM * NOTE - Callers to zio_walk_parents() and zio_walk_children must 3188632SBill.Moore@Sun.COM * continue calling these functions until they return NULL. 3198632SBill.Moore@Sun.COM * Otherwise, the next caller will pick up the list walk in 3208632SBill.Moore@Sun.COM * some indeterminate state. (Otherwise every caller would 3218632SBill.Moore@Sun.COM * have to pass in a cookie to keep the state represented by 3228632SBill.Moore@Sun.COM * io_walk_link, which gets annoying.) 3238632SBill.Moore@Sun.COM */ 3248632SBill.Moore@Sun.COM zio_t * 3258632SBill.Moore@Sun.COM zio_walk_parents(zio_t *cio) 3268632SBill.Moore@Sun.COM { 3278632SBill.Moore@Sun.COM zio_link_t *zl = cio->io_walk_link; 3288632SBill.Moore@Sun.COM list_t *pl = &cio->io_parent_list; 3297754SJeff.Bonwick@Sun.COM 3308632SBill.Moore@Sun.COM zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 3318632SBill.Moore@Sun.COM cio->io_walk_link = zl; 3328632SBill.Moore@Sun.COM 3338632SBill.Moore@Sun.COM if (zl == NULL) 3348632SBill.Moore@Sun.COM return (NULL); 3358632SBill.Moore@Sun.COM 3368632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 3378632SBill.Moore@Sun.COM return (zl->zl_parent); 3388632SBill.Moore@Sun.COM } 3398632SBill.Moore@Sun.COM 3408632SBill.Moore@Sun.COM zio_t * 3418632SBill.Moore@Sun.COM zio_walk_children(zio_t *pio) 3427754SJeff.Bonwick@Sun.COM { 3438632SBill.Moore@Sun.COM zio_link_t *zl = pio->io_walk_link; 3448632SBill.Moore@Sun.COM list_t *cl = &pio->io_child_list; 3458632SBill.Moore@Sun.COM 3468632SBill.Moore@Sun.COM zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 3478632SBill.Moore@Sun.COM pio->io_walk_link = zl; 3488632SBill.Moore@Sun.COM 3498632SBill.Moore@Sun.COM if (zl == NULL) 3508632SBill.Moore@Sun.COM return (NULL); 3518632SBill.Moore@Sun.COM 3528632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 3538632SBill.Moore@Sun.COM return (zl->zl_child); 3548632SBill.Moore@Sun.COM } 3558632SBill.Moore@Sun.COM 3568632SBill.Moore@Sun.COM zio_t * 3578632SBill.Moore@Sun.COM zio_unique_parent(zio_t *cio) 3588632SBill.Moore@Sun.COM { 3598632SBill.Moore@Sun.COM zio_t *pio = zio_walk_parents(cio); 3608632SBill.Moore@Sun.COM 3618632SBill.Moore@Sun.COM VERIFY(zio_walk_parents(cio) == NULL); 3628632SBill.Moore@Sun.COM return (pio); 3638632SBill.Moore@Sun.COM } 3648632SBill.Moore@Sun.COM 3658632SBill.Moore@Sun.COM void 3668632SBill.Moore@Sun.COM zio_add_child(zio_t *pio, zio_t *cio) 3678632SBill.Moore@Sun.COM { 3688632SBill.Moore@Sun.COM zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 3698632SBill.Moore@Sun.COM 3708632SBill.Moore@Sun.COM /* 3718632SBill.Moore@Sun.COM * Logical I/Os can have logical, gang, or vdev children. 3728632SBill.Moore@Sun.COM * Gang I/Os can have gang or vdev children. 3738632SBill.Moore@Sun.COM * Vdev I/Os can only have vdev children. 3748632SBill.Moore@Sun.COM * The following ASSERT captures all of these constraints. 3758632SBill.Moore@Sun.COM */ 3768632SBill.Moore@Sun.COM ASSERT(cio->io_child_type <= pio->io_child_type); 3778632SBill.Moore@Sun.COM 3788632SBill.Moore@Sun.COM zl->zl_parent = pio; 3798632SBill.Moore@Sun.COM zl->zl_child = cio; 3808632SBill.Moore@Sun.COM 3818632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 3827754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 3838632SBill.Moore@Sun.COM 3848632SBill.Moore@Sun.COM ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 3858632SBill.Moore@Sun.COM 3868632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3878632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 3888632SBill.Moore@Sun.COM 3898632SBill.Moore@Sun.COM list_insert_head(&pio->io_child_list, zl); 3908632SBill.Moore@Sun.COM list_insert_head(&cio->io_parent_list, zl); 3918632SBill.Moore@Sun.COM 39210922SJeff.Bonwick@Sun.COM pio->io_child_count++; 39310922SJeff.Bonwick@Sun.COM cio->io_parent_count++; 39410922SJeff.Bonwick@Sun.COM 3957754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 3968632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 3977754SJeff.Bonwick@Sun.COM } 3987754SJeff.Bonwick@Sun.COM 3997754SJeff.Bonwick@Sun.COM static void 4008632SBill.Moore@Sun.COM zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 4017754SJeff.Bonwick@Sun.COM { 4028632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 4038632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 4047754SJeff.Bonwick@Sun.COM 4058632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 4067754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4078632SBill.Moore@Sun.COM 4088632SBill.Moore@Sun.COM list_remove(&pio->io_child_list, zl); 4098632SBill.Moore@Sun.COM list_remove(&cio->io_parent_list, zl); 4108632SBill.Moore@Sun.COM 41110922SJeff.Bonwick@Sun.COM pio->io_child_count--; 41210922SJeff.Bonwick@Sun.COM cio->io_parent_count--; 41310922SJeff.Bonwick@Sun.COM 4147754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4158632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 4168632SBill.Moore@Sun.COM 4178632SBill.Moore@Sun.COM kmem_cache_free(zio_link_cache, zl); 4187754SJeff.Bonwick@Sun.COM } 4197754SJeff.Bonwick@Sun.COM 4207754SJeff.Bonwick@Sun.COM static boolean_t 4217754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 4227754SJeff.Bonwick@Sun.COM { 4237754SJeff.Bonwick@Sun.COM uint64_t *countp = &zio->io_children[child][wait]; 4247754SJeff.Bonwick@Sun.COM boolean_t waiting = B_FALSE; 4257754SJeff.Bonwick@Sun.COM 4267754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 4277754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 4287754SJeff.Bonwick@Sun.COM if (*countp != 0) { 42910922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 4307754SJeff.Bonwick@Sun.COM zio->io_stall = countp; 4317754SJeff.Bonwick@Sun.COM waiting = B_TRUE; 4327754SJeff.Bonwick@Sun.COM } 4337754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 4347754SJeff.Bonwick@Sun.COM 4357754SJeff.Bonwick@Sun.COM return (waiting); 4367754SJeff.Bonwick@Sun.COM } 4377754SJeff.Bonwick@Sun.COM 4387754SJeff.Bonwick@Sun.COM static void 4397754SJeff.Bonwick@Sun.COM zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 4407754SJeff.Bonwick@Sun.COM { 4417754SJeff.Bonwick@Sun.COM uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 4427754SJeff.Bonwick@Sun.COM int *errorp = &pio->io_child_error[zio->io_child_type]; 4437754SJeff.Bonwick@Sun.COM 4447754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4457754SJeff.Bonwick@Sun.COM if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 4467754SJeff.Bonwick@Sun.COM *errorp = zio_worst_error(*errorp, zio->io_error); 4477754SJeff.Bonwick@Sun.COM pio->io_reexecute |= zio->io_reexecute; 4487754SJeff.Bonwick@Sun.COM ASSERT3U(*countp, >, 0); 4497754SJeff.Bonwick@Sun.COM if (--*countp == 0 && pio->io_stall == countp) { 4507754SJeff.Bonwick@Sun.COM pio->io_stall = NULL; 4517754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4527754SJeff.Bonwick@Sun.COM zio_execute(pio); 4537754SJeff.Bonwick@Sun.COM } else { 4547754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4557754SJeff.Bonwick@Sun.COM } 4567754SJeff.Bonwick@Sun.COM } 4577754SJeff.Bonwick@Sun.COM 4587754SJeff.Bonwick@Sun.COM static void 4597754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio_t *zio, enum zio_child c) 4607754SJeff.Bonwick@Sun.COM { 4617754SJeff.Bonwick@Sun.COM if (zio->io_child_error[c] != 0 && zio->io_error == 0) 4627754SJeff.Bonwick@Sun.COM zio->io_error = zio->io_child_error[c]; 4637754SJeff.Bonwick@Sun.COM } 4647754SJeff.Bonwick@Sun.COM 4657754SJeff.Bonwick@Sun.COM /* 4667754SJeff.Bonwick@Sun.COM * ========================================================================== 4677754SJeff.Bonwick@Sun.COM * Create the various types of I/O (read, write, free, etc) 468789Sahrens * ========================================================================== 469789Sahrens */ 470789Sahrens static zio_t * 47110922SJeff.Bonwick@Sun.COM zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 472789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 47310922SJeff.Bonwick@Sun.COM zio_type_t type, int priority, enum zio_flag flags, 47410922SJeff.Bonwick@Sun.COM vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 47510922SJeff.Bonwick@Sun.COM enum zio_stage stage, enum zio_stage pipeline) 476789Sahrens { 477789Sahrens zio_t *zio; 478789Sahrens 479789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 480789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 4817754SJeff.Bonwick@Sun.COM ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 482789Sahrens 4837754SJeff.Bonwick@Sun.COM ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 4847754SJeff.Bonwick@Sun.COM ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 4857754SJeff.Bonwick@Sun.COM ASSERT(vd || stage == ZIO_STAGE_OPEN); 4867046Sahrens 4874055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 4884055Seschrock bzero(zio, sizeof (zio_t)); 4897754SJeff.Bonwick@Sun.COM 4907754SJeff.Bonwick@Sun.COM mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 4917754SJeff.Bonwick@Sun.COM cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 4927754SJeff.Bonwick@Sun.COM 4938632SBill.Moore@Sun.COM list_create(&zio->io_parent_list, sizeof (zio_link_t), 4948632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_parent_node)); 4958632SBill.Moore@Sun.COM list_create(&zio->io_child_list, sizeof (zio_link_t), 4968632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_child_node)); 4978632SBill.Moore@Sun.COM 4987754SJeff.Bonwick@Sun.COM if (vd != NULL) 4997754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_VDEV; 5007754SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_GANG_CHILD) 5017754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_GANG; 50210922SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_DDT_CHILD) 50310922SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_DDT; 5047754SJeff.Bonwick@Sun.COM else 5057754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_LOGICAL; 5067754SJeff.Bonwick@Sun.COM 507789Sahrens if (bp != NULL) { 50810922SJeff.Bonwick@Sun.COM zio->io_bp = (blkptr_t *)bp; 509789Sahrens zio->io_bp_copy = *bp; 510789Sahrens zio->io_bp_orig = *bp; 51110922SJeff.Bonwick@Sun.COM if (type != ZIO_TYPE_WRITE || 51210922SJeff.Bonwick@Sun.COM zio->io_child_type == ZIO_CHILD_DDT) 5137754SJeff.Bonwick@Sun.COM zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 5149443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) 5157754SJeff.Bonwick@Sun.COM zio->io_logical = zio; 5169443SBill.Moore@Sun.COM if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 5179443SBill.Moore@Sun.COM pipeline |= ZIO_GANG_STAGES; 518789Sahrens } 5197754SJeff.Bonwick@Sun.COM 5207754SJeff.Bonwick@Sun.COM zio->io_spa = spa; 5217754SJeff.Bonwick@Sun.COM zio->io_txg = txg; 522789Sahrens zio->io_done = done; 523789Sahrens zio->io_private = private; 524789Sahrens zio->io_type = type; 525789Sahrens zio->io_priority = priority; 5267754SJeff.Bonwick@Sun.COM zio->io_vd = vd; 5277754SJeff.Bonwick@Sun.COM zio->io_offset = offset; 52810922SJeff.Bonwick@Sun.COM zio->io_orig_data = zio->io_data = data; 52910922SJeff.Bonwick@Sun.COM zio->io_orig_size = zio->io_size = size; 5307754SJeff.Bonwick@Sun.COM zio->io_orig_flags = zio->io_flags = flags; 5317754SJeff.Bonwick@Sun.COM zio->io_orig_stage = zio->io_stage = stage; 5327754SJeff.Bonwick@Sun.COM zio->io_orig_pipeline = zio->io_pipeline = pipeline; 5337754SJeff.Bonwick@Sun.COM 5348632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 5358632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 5368632SBill.Moore@Sun.COM 5377754SJeff.Bonwick@Sun.COM if (zb != NULL) 5387754SJeff.Bonwick@Sun.COM zio->io_bookmark = *zb; 539789Sahrens 5407754SJeff.Bonwick@Sun.COM if (pio != NULL) { 5417754SJeff.Bonwick@Sun.COM if (zio->io_logical == NULL) 5421544Seschrock zio->io_logical = pio->io_logical; 5439443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_GANG) 5449443SBill.Moore@Sun.COM zio->io_gang_leader = pio->io_gang_leader; 5457754SJeff.Bonwick@Sun.COM zio_add_child(pio, zio); 546789Sahrens } 547789Sahrens 548789Sahrens return (zio); 549789Sahrens } 550789Sahrens 5515329Sgw25295 static void 5527754SJeff.Bonwick@Sun.COM zio_destroy(zio_t *zio) 5535329Sgw25295 { 5548632SBill.Moore@Sun.COM list_destroy(&zio->io_parent_list); 5558632SBill.Moore@Sun.COM list_destroy(&zio->io_child_list); 5567754SJeff.Bonwick@Sun.COM mutex_destroy(&zio->io_lock); 5577754SJeff.Bonwick@Sun.COM cv_destroy(&zio->io_cv); 5587754SJeff.Bonwick@Sun.COM kmem_cache_free(zio_cache, zio); 5595329Sgw25295 } 5605329Sgw25295 561789Sahrens zio_t * 5628632SBill.Moore@Sun.COM zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 56310922SJeff.Bonwick@Sun.COM void *private, enum zio_flag flags) 564789Sahrens { 565789Sahrens zio_t *zio; 566789Sahrens 567789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 5688632SBill.Moore@Sun.COM ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 5697754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 570789Sahrens 571789Sahrens return (zio); 572789Sahrens } 573789Sahrens 574789Sahrens zio_t * 57510922SJeff.Bonwick@Sun.COM zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 576789Sahrens { 5778632SBill.Moore@Sun.COM return (zio_null(NULL, spa, NULL, done, private, flags)); 578789Sahrens } 579789Sahrens 580789Sahrens zio_t * 5817754SJeff.Bonwick@Sun.COM zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 5827754SJeff.Bonwick@Sun.COM void *data, uint64_t size, zio_done_func_t *done, void *private, 58310922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 584789Sahrens { 585789Sahrens zio_t *zio; 586789Sahrens 58710922SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 5887046Sahrens data, size, done, private, 5897754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 59010922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 59110922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 592789Sahrens 593789Sahrens return (zio); 594789Sahrens } 595789Sahrens 596789Sahrens zio_t * 5977754SJeff.Bonwick@Sun.COM zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 59810922SJeff.Bonwick@Sun.COM void *data, uint64_t size, const zio_prop_t *zp, 5997754SJeff.Bonwick@Sun.COM zio_done_func_t *ready, zio_done_func_t *done, void *private, 60010922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 601789Sahrens { 602789Sahrens zio_t *zio; 603789Sahrens 6047754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 6057754SJeff.Bonwick@Sun.COM zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 6067754SJeff.Bonwick@Sun.COM zp->zp_compress >= ZIO_COMPRESS_OFF && 6077754SJeff.Bonwick@Sun.COM zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 6087754SJeff.Bonwick@Sun.COM zp->zp_type < DMU_OT_NUMTYPES && 6097754SJeff.Bonwick@Sun.COM zp->zp_level < 32 && 61010922SJeff.Bonwick@Sun.COM zp->zp_copies > 0 && 61110922SJeff.Bonwick@Sun.COM zp->zp_copies <= spa_max_replication(spa) && 61210922SJeff.Bonwick@Sun.COM zp->zp_dedup <= 1 && 61310922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify <= 1); 6145329Sgw25295 615789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6167754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 61710922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 61810922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 619789Sahrens 6203547Smaybee zio->io_ready = ready; 6217754SJeff.Bonwick@Sun.COM zio->io_prop = *zp; 622789Sahrens 623789Sahrens return (zio); 624789Sahrens } 625789Sahrens 626789Sahrens zio_t * 6277754SJeff.Bonwick@Sun.COM zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 6287754SJeff.Bonwick@Sun.COM uint64_t size, zio_done_func_t *done, void *private, int priority, 62910922SJeff.Bonwick@Sun.COM enum zio_flag flags, zbookmark_t *zb) 630789Sahrens { 631789Sahrens zio_t *zio; 632789Sahrens 6337181Sperrin zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6347754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 6357754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 636789Sahrens 637789Sahrens return (zio); 638789Sahrens } 639789Sahrens 64010922SJeff.Bonwick@Sun.COM void 64110922SJeff.Bonwick@Sun.COM zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 64210922SJeff.Bonwick@Sun.COM { 64310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 64410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 64510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 64610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 64710922SJeff.Bonwick@Sun.COM 64810922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies = copies; 64910922SJeff.Bonwick@Sun.COM zio->io_bp_override = bp; 65010922SJeff.Bonwick@Sun.COM } 65110922SJeff.Bonwick@Sun.COM 65210922SJeff.Bonwick@Sun.COM void 65310922SJeff.Bonwick@Sun.COM zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 65410922SJeff.Bonwick@Sun.COM { 65512470SMatthew.Ahrens@Sun.COM bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 65610922SJeff.Bonwick@Sun.COM } 65710922SJeff.Bonwick@Sun.COM 658789Sahrens zio_t * 65910922SJeff.Bonwick@Sun.COM zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 66010922SJeff.Bonwick@Sun.COM enum zio_flag flags) 661789Sahrens { 662789Sahrens zio_t *zio; 663789Sahrens 66412296SLin.Ling@Sun.COM dprintf_bp(bp, "freeing in txg %llu, pass %u", 66512296SLin.Ling@Sun.COM (longlong_t)txg, spa->spa_sync_pass); 66612296SLin.Ling@Sun.COM 667789Sahrens ASSERT(!BP_IS_HOLE(bp)); 66810922SJeff.Bonwick@Sun.COM ASSERT(spa_syncing_txg(spa) == txg); 66910922SJeff.Bonwick@Sun.COM ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 670789Sahrens 6717754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 67210922SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 6737754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 674789Sahrens 675789Sahrens return (zio); 676789Sahrens } 677789Sahrens 678789Sahrens zio_t * 67910922SJeff.Bonwick@Sun.COM zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 68010922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, enum zio_flag flags) 681789Sahrens { 682789Sahrens zio_t *zio; 683789Sahrens 684789Sahrens /* 685789Sahrens * A claim is an allocation of a specific block. Claims are needed 686789Sahrens * to support immediate writes in the intent log. The issue is that 687789Sahrens * immediate writes contain committed data, but in a txg that was 688789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 689789Sahrens * the intent log claims all blocks that contain immediate write data 690789Sahrens * so that the SPA knows they're in use. 691789Sahrens * 692789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 693789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 69410922SJeff.Bonwick@Sun.COM * If txg == 0 we just verify that the block is claimable. 695789Sahrens */ 696789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 69710922SJeff.Bonwick@Sun.COM ASSERT(txg == spa_first_txg(spa) || txg == 0); 69810922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 699789Sahrens 7007754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 7017754SJeff.Bonwick@Sun.COM done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 7027754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 703789Sahrens 704789Sahrens return (zio); 705789Sahrens } 706789Sahrens 707789Sahrens zio_t * 708789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 70910922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 710789Sahrens { 711789Sahrens zio_t *zio; 712789Sahrens int c; 713789Sahrens 714789Sahrens if (vd->vdev_children == 0) { 715789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 7167754SJeff.Bonwick@Sun.COM ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 717789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 718789Sahrens 719789Sahrens zio->io_cmd = cmd; 720789Sahrens } else { 7218632SBill.Moore@Sun.COM zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 722789Sahrens 723789Sahrens for (c = 0; c < vd->vdev_children; c++) 724789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 725789Sahrens done, private, priority, flags)); 726789Sahrens } 727789Sahrens 728789Sahrens return (zio); 729789Sahrens } 730789Sahrens 731789Sahrens zio_t * 732789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 733789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 73410922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 735789Sahrens { 736789Sahrens zio_t *zio; 7375329Sgw25295 7387754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7397754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 7407754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7417754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 742789Sahrens 7437754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7447754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 745789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 746789Sahrens 7477754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 748789Sahrens 749789Sahrens return (zio); 750789Sahrens } 751789Sahrens 752789Sahrens zio_t * 753789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 754789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 75510922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 756789Sahrens { 757789Sahrens zio_t *zio; 758789Sahrens 7597754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7607754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 7617754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7627754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 7635329Sgw25295 7647754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7657754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 766789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 767789Sahrens 7687754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 769789Sahrens 77011670SNeil.Perrin@Sun.COM if (zio_checksum_table[checksum].ci_eck) { 771789Sahrens /* 77211670SNeil.Perrin@Sun.COM * zec checksums are necessarily destructive -- they modify 7737754SJeff.Bonwick@Sun.COM * the end of the write buffer to hold the verifier/checksum. 774789Sahrens * Therefore, we must make a local copy in case the data is 7757754SJeff.Bonwick@Sun.COM * being written to multiple places in parallel. 776789Sahrens */ 7777754SJeff.Bonwick@Sun.COM void *wbuf = zio_buf_alloc(size); 778789Sahrens bcopy(data, wbuf, size); 7797754SJeff.Bonwick@Sun.COM zio_push_transform(zio, wbuf, size, size, NULL); 780789Sahrens } 781789Sahrens 782789Sahrens return (zio); 783789Sahrens } 784789Sahrens 785789Sahrens /* 7867754SJeff.Bonwick@Sun.COM * Create a child I/O to do some work for us. 787789Sahrens */ 788789Sahrens zio_t * 7897754SJeff.Bonwick@Sun.COM zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 79010922SJeff.Bonwick@Sun.COM void *data, uint64_t size, int type, int priority, enum zio_flag flags, 791789Sahrens zio_done_func_t *done, void *private) 792789Sahrens { 79310922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 7947754SJeff.Bonwick@Sun.COM zio_t *zio; 7957754SJeff.Bonwick@Sun.COM 7967754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_parent == 7977754SJeff.Bonwick@Sun.COM (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 798789Sahrens 799789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 800789Sahrens /* 801789Sahrens * If we have the bp, then the child should perform the 802789Sahrens * checksum and the parent need not. This pushes error 803789Sahrens * detection as close to the leaves as possible and 804789Sahrens * eliminates redundant checksums in the interior nodes. 805789Sahrens */ 80610922SJeff.Bonwick@Sun.COM pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 80710922SJeff.Bonwick@Sun.COM pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 8087754SJeff.Bonwick@Sun.COM } 8097754SJeff.Bonwick@Sun.COM 8107754SJeff.Bonwick@Sun.COM if (vd->vdev_children == 0) 8117754SJeff.Bonwick@Sun.COM offset += VDEV_LABEL_START_SIZE; 8127754SJeff.Bonwick@Sun.COM 81310922SJeff.Bonwick@Sun.COM flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 81410922SJeff.Bonwick@Sun.COM 81510922SJeff.Bonwick@Sun.COM /* 81610922SJeff.Bonwick@Sun.COM * If we've decided to do a repair, the write is not speculative -- 81710922SJeff.Bonwick@Sun.COM * even if the original read was. 81810922SJeff.Bonwick@Sun.COM */ 81910922SJeff.Bonwick@Sun.COM if (flags & ZIO_FLAG_IO_REPAIR) 82010922SJeff.Bonwick@Sun.COM flags &= ~ZIO_FLAG_SPECULATIVE; 82110922SJeff.Bonwick@Sun.COM 8227754SJeff.Bonwick@Sun.COM zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 82310922SJeff.Bonwick@Sun.COM done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 82410922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 8257754SJeff.Bonwick@Sun.COM 8267754SJeff.Bonwick@Sun.COM return (zio); 8277754SJeff.Bonwick@Sun.COM } 8287754SJeff.Bonwick@Sun.COM 8297754SJeff.Bonwick@Sun.COM zio_t * 8307754SJeff.Bonwick@Sun.COM zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 83110922SJeff.Bonwick@Sun.COM int type, int priority, enum zio_flag flags, 83210922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private) 8337754SJeff.Bonwick@Sun.COM { 8347754SJeff.Bonwick@Sun.COM zio_t *zio; 8357754SJeff.Bonwick@Sun.COM 8367754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_ops->vdev_op_leaf); 8377754SJeff.Bonwick@Sun.COM 8387754SJeff.Bonwick@Sun.COM zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 8397754SJeff.Bonwick@Sun.COM data, size, done, private, type, priority, 8407754SJeff.Bonwick@Sun.COM flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 8417754SJeff.Bonwick@Sun.COM vd, offset, NULL, 84210922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 8437754SJeff.Bonwick@Sun.COM 8447754SJeff.Bonwick@Sun.COM return (zio); 8457754SJeff.Bonwick@Sun.COM } 8467754SJeff.Bonwick@Sun.COM 8477754SJeff.Bonwick@Sun.COM void 8487754SJeff.Bonwick@Sun.COM zio_flush(zio_t *zio, vdev_t *vd) 8497754SJeff.Bonwick@Sun.COM { 8507754SJeff.Bonwick@Sun.COM zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 8517754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_PRIORITY_NOW, 8527754SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 8537754SJeff.Bonwick@Sun.COM } 8547754SJeff.Bonwick@Sun.COM 85511670SNeil.Perrin@Sun.COM void 85611670SNeil.Perrin@Sun.COM zio_shrink(zio_t *zio, uint64_t size) 85711670SNeil.Perrin@Sun.COM { 85811670SNeil.Perrin@Sun.COM ASSERT(zio->io_executor == NULL); 85911670SNeil.Perrin@Sun.COM ASSERT(zio->io_orig_size == zio->io_size); 86011670SNeil.Perrin@Sun.COM ASSERT(size <= zio->io_size); 86111670SNeil.Perrin@Sun.COM 86211670SNeil.Perrin@Sun.COM /* 86311670SNeil.Perrin@Sun.COM * We don't shrink for raidz because of problems with the 86411670SNeil.Perrin@Sun.COM * reconstruction when reading back less than the block size. 86511670SNeil.Perrin@Sun.COM * Note, BP_IS_RAIDZ() assumes no compression. 86611670SNeil.Perrin@Sun.COM */ 86711670SNeil.Perrin@Sun.COM ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 86811670SNeil.Perrin@Sun.COM if (!BP_IS_RAIDZ(zio->io_bp)) 86911670SNeil.Perrin@Sun.COM zio->io_orig_size = zio->io_size = size; 87011670SNeil.Perrin@Sun.COM } 87111670SNeil.Perrin@Sun.COM 8727754SJeff.Bonwick@Sun.COM /* 8737754SJeff.Bonwick@Sun.COM * ========================================================================== 8747754SJeff.Bonwick@Sun.COM * Prepare to read and write logical blocks 8757754SJeff.Bonwick@Sun.COM * ========================================================================== 8767754SJeff.Bonwick@Sun.COM */ 8777754SJeff.Bonwick@Sun.COM 8787754SJeff.Bonwick@Sun.COM static int 8797754SJeff.Bonwick@Sun.COM zio_read_bp_init(zio_t *zio) 8807754SJeff.Bonwick@Sun.COM { 8817754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 8827754SJeff.Bonwick@Sun.COM 8838274SJeff.Bonwick@Sun.COM if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 8849443SBill.Moore@Sun.COM zio->io_child_type == ZIO_CHILD_LOGICAL && 8859443SBill.Moore@Sun.COM !(zio->io_flags & ZIO_FLAG_RAW)) { 88610922SJeff.Bonwick@Sun.COM uint64_t psize = BP_GET_PSIZE(bp); 88710922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(psize); 88810922SJeff.Bonwick@Sun.COM 88910922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 8907754SJeff.Bonwick@Sun.COM } 8917754SJeff.Bonwick@Sun.COM 8927754SJeff.Bonwick@Sun.COM if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 8937754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 8947754SJeff.Bonwick@Sun.COM 89511125SJeff.Bonwick@Sun.COM if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 89611125SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 89711125SJeff.Bonwick@Sun.COM 89810922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 89910922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 90010922SJeff.Bonwick@Sun.COM 9017754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 9027754SJeff.Bonwick@Sun.COM } 9037754SJeff.Bonwick@Sun.COM 9047754SJeff.Bonwick@Sun.COM static int 9057754SJeff.Bonwick@Sun.COM zio_write_bp_init(zio_t *zio) 9067754SJeff.Bonwick@Sun.COM { 90710922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 9087754SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 90910922SJeff.Bonwick@Sun.COM enum zio_compress compress = zp->zp_compress; 9107754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 9117754SJeff.Bonwick@Sun.COM uint64_t lsize = zio->io_size; 91210922SJeff.Bonwick@Sun.COM uint64_t psize = lsize; 9137754SJeff.Bonwick@Sun.COM int pass = 1; 9147754SJeff.Bonwick@Sun.COM 9157754SJeff.Bonwick@Sun.COM /* 9167754SJeff.Bonwick@Sun.COM * If our children haven't all reached the ready stage, 9177754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 9187754SJeff.Bonwick@Sun.COM */ 9197754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 9207754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 9217754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 9227754SJeff.Bonwick@Sun.COM 9237754SJeff.Bonwick@Sun.COM if (!IO_IS_ALLOCATING(zio)) 9247754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 9257754SJeff.Bonwick@Sun.COM 92610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 92710922SJeff.Bonwick@Sun.COM 92810922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 92910922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth != zio->io_txg); 93010922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 93110922SJeff.Bonwick@Sun.COM 93210922SJeff.Bonwick@Sun.COM *bp = *zio->io_bp_override; 93310922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 93410922SJeff.Bonwick@Sun.COM 93510922SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(bp) || !zp->zp_dedup) 93610922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 93710922SJeff.Bonwick@Sun.COM 93810922SJeff.Bonwick@Sun.COM ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 93910922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify); 94010922SJeff.Bonwick@Sun.COM 94110922SJeff.Bonwick@Sun.COM if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 94210922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, 1); 94310922SJeff.Bonwick@Sun.COM zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 94410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 94510922SJeff.Bonwick@Sun.COM } 94610922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 94710922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 94810922SJeff.Bonwick@Sun.COM } 9497754SJeff.Bonwick@Sun.COM 9507754SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg) { 9517754SJeff.Bonwick@Sun.COM /* 9527754SJeff.Bonwick@Sun.COM * We're rewriting an existing block, which means we're 9537754SJeff.Bonwick@Sun.COM * working on behalf of spa_sync(). For spa_sync() to 9547754SJeff.Bonwick@Sun.COM * converge, it must eventually be the case that we don't 9557754SJeff.Bonwick@Sun.COM * have to allocate new blocks. But compression changes 9567754SJeff.Bonwick@Sun.COM * the blocksize, which forces a reallocate, and makes 9577754SJeff.Bonwick@Sun.COM * convergence take longer. Therefore, after the first 9587754SJeff.Bonwick@Sun.COM * few passes, stop compressing to ensure convergence. 9597754SJeff.Bonwick@Sun.COM */ 96010922SJeff.Bonwick@Sun.COM pass = spa_sync_pass(spa); 96110922SJeff.Bonwick@Sun.COM 96210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(spa)); 96310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 96410922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp)); 9657754SJeff.Bonwick@Sun.COM 9667754SJeff.Bonwick@Sun.COM if (pass > SYNC_PASS_DONT_COMPRESS) 9677754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 9687754SJeff.Bonwick@Sun.COM 9697754SJeff.Bonwick@Sun.COM /* Make sure someone doesn't change their mind on overwrites */ 97010922SJeff.Bonwick@Sun.COM ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 97110922SJeff.Bonwick@Sun.COM spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 9727754SJeff.Bonwick@Sun.COM } 9737754SJeff.Bonwick@Sun.COM 9747754SJeff.Bonwick@Sun.COM if (compress != ZIO_COMPRESS_OFF) { 97510922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(lsize); 97610922SJeff.Bonwick@Sun.COM psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 97710922SJeff.Bonwick@Sun.COM if (psize == 0 || psize == lsize) { 9787754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 97910922SJeff.Bonwick@Sun.COM zio_buf_free(cbuf, lsize); 98010922SJeff.Bonwick@Sun.COM } else { 98110922SJeff.Bonwick@Sun.COM ASSERT(psize < lsize); 98210922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, lsize, NULL); 9837754SJeff.Bonwick@Sun.COM } 984789Sahrens } 985789Sahrens 9867754SJeff.Bonwick@Sun.COM /* 9877754SJeff.Bonwick@Sun.COM * The final pass of spa_sync() must be all rewrites, but the first 9887754SJeff.Bonwick@Sun.COM * few passes offer a trade-off: allocating blocks defers convergence, 9897754SJeff.Bonwick@Sun.COM * but newly allocated blocks are sequential, so they can be written 9907754SJeff.Bonwick@Sun.COM * to disk faster. Therefore, we allow the first few passes of 9917754SJeff.Bonwick@Sun.COM * spa_sync() to allocate new blocks, but force rewrites after that. 9927754SJeff.Bonwick@Sun.COM * There should only be a handful of blocks after pass 1 in any case. 9937754SJeff.Bonwick@Sun.COM */ 99410922SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 9957754SJeff.Bonwick@Sun.COM pass > SYNC_PASS_REWRITE) { 99610922SJeff.Bonwick@Sun.COM ASSERT(psize != 0); 99710922SJeff.Bonwick@Sun.COM enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 9987754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 9997754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_REWRITE; 10007754SJeff.Bonwick@Sun.COM } else { 10017754SJeff.Bonwick@Sun.COM BP_ZERO(bp); 10027754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 10037754SJeff.Bonwick@Sun.COM } 10047754SJeff.Bonwick@Sun.COM 100510922SJeff.Bonwick@Sun.COM if (psize == 0) { 10067754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 10077754SJeff.Bonwick@Sun.COM } else { 10087754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 10097754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(bp, lsize); 101010922SJeff.Bonwick@Sun.COM BP_SET_PSIZE(bp, psize); 10117754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(bp, compress); 10127754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(bp, zp->zp_checksum); 10137754SJeff.Bonwick@Sun.COM BP_SET_TYPE(bp, zp->zp_type); 10147754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(bp, zp->zp_level); 101510922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, zp->zp_dedup); 10167754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 101710922SJeff.Bonwick@Sun.COM if (zp->zp_dedup) { 101810922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 101910922SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 102010922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 102110922SJeff.Bonwick@Sun.COM } 102210922SJeff.Bonwick@Sun.COM } 102310922SJeff.Bonwick@Sun.COM 102410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 102510922SJeff.Bonwick@Sun.COM } 102610922SJeff.Bonwick@Sun.COM 102710922SJeff.Bonwick@Sun.COM static int 102810922SJeff.Bonwick@Sun.COM zio_free_bp_init(zio_t *zio) 102910922SJeff.Bonwick@Sun.COM { 103010922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 103110922SJeff.Bonwick@Sun.COM 103210922SJeff.Bonwick@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 103310922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp)) 103410922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 10357754SJeff.Bonwick@Sun.COM } 10367754SJeff.Bonwick@Sun.COM 10377754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 10387754SJeff.Bonwick@Sun.COM } 10397754SJeff.Bonwick@Sun.COM 10407754SJeff.Bonwick@Sun.COM /* 10417754SJeff.Bonwick@Sun.COM * ========================================================================== 10427754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline 10437754SJeff.Bonwick@Sun.COM * ========================================================================== 10447754SJeff.Bonwick@Sun.COM */ 10457754SJeff.Bonwick@Sun.COM 10467754SJeff.Bonwick@Sun.COM static void 104711173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) 10487754SJeff.Bonwick@Sun.COM { 104911146SGeorge.Wilson@Sun.COM spa_t *spa = zio->io_spa; 10507754SJeff.Bonwick@Sun.COM zio_type_t t = zio->io_type; 105111173SJonathan.Adams@Sun.COM int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); 10527754SJeff.Bonwick@Sun.COM 10537754SJeff.Bonwick@Sun.COM /* 10549722SGeorge.Wilson@Sun.COM * If we're a config writer or a probe, the normal issue and 10559722SGeorge.Wilson@Sun.COM * interrupt threads may all be blocked waiting for the config lock. 10569722SGeorge.Wilson@Sun.COM * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 10577754SJeff.Bonwick@Sun.COM */ 10589722SGeorge.Wilson@Sun.COM if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 10597754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10607754SJeff.Bonwick@Sun.COM 10617754SJeff.Bonwick@Sun.COM /* 10627754SJeff.Bonwick@Sun.COM * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 10637754SJeff.Bonwick@Sun.COM */ 10647754SJeff.Bonwick@Sun.COM if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 10657754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10667754SJeff.Bonwick@Sun.COM 106711146SGeorge.Wilson@Sun.COM /* 106811146SGeorge.Wilson@Sun.COM * If this is a high priority I/O, then use the high priority taskq. 106911146SGeorge.Wilson@Sun.COM */ 107011146SGeorge.Wilson@Sun.COM if (zio->io_priority == ZIO_PRIORITY_NOW && 107111146SGeorge.Wilson@Sun.COM spa->spa_zio_taskq[t][q + 1] != NULL) 107211146SGeorge.Wilson@Sun.COM q++; 107311146SGeorge.Wilson@Sun.COM 107411146SGeorge.Wilson@Sun.COM ASSERT3U(q, <, ZIO_TASKQ_TYPES); 107511146SGeorge.Wilson@Sun.COM (void) taskq_dispatch(spa->spa_zio_taskq[t][q], 107611173SJonathan.Adams@Sun.COM (task_func_t *)zio_execute, zio, flags); 10777754SJeff.Bonwick@Sun.COM } 10787754SJeff.Bonwick@Sun.COM 10797754SJeff.Bonwick@Sun.COM static boolean_t 10807754SJeff.Bonwick@Sun.COM zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 10817754SJeff.Bonwick@Sun.COM { 10827754SJeff.Bonwick@Sun.COM kthread_t *executor = zio->io_executor; 10837754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 1084789Sahrens 10857754SJeff.Bonwick@Sun.COM for (zio_type_t t = 0; t < ZIO_TYPES; t++) 10867754SJeff.Bonwick@Sun.COM if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 10877754SJeff.Bonwick@Sun.COM return (B_TRUE); 10887754SJeff.Bonwick@Sun.COM 10897754SJeff.Bonwick@Sun.COM return (B_FALSE); 10907754SJeff.Bonwick@Sun.COM } 10917754SJeff.Bonwick@Sun.COM 10927754SJeff.Bonwick@Sun.COM static int 10937754SJeff.Bonwick@Sun.COM zio_issue_async(zio_t *zio) 10947754SJeff.Bonwick@Sun.COM { 109511173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 10967754SJeff.Bonwick@Sun.COM 10977754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 10987754SJeff.Bonwick@Sun.COM } 10997754SJeff.Bonwick@Sun.COM 11007754SJeff.Bonwick@Sun.COM void 11017754SJeff.Bonwick@Sun.COM zio_interrupt(zio_t *zio) 11027754SJeff.Bonwick@Sun.COM { 110311173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 11047754SJeff.Bonwick@Sun.COM } 11057754SJeff.Bonwick@Sun.COM 11067754SJeff.Bonwick@Sun.COM /* 11077754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline until one of the following occurs: 11087754SJeff.Bonwick@Sun.COM * (1) the I/O completes; (2) the pipeline stalls waiting for 11097754SJeff.Bonwick@Sun.COM * dependent child I/Os; (3) the I/O issues, so we're waiting 11107754SJeff.Bonwick@Sun.COM * for an I/O completion interrupt; (4) the I/O is delegated by 11117754SJeff.Bonwick@Sun.COM * vdev-level caching or aggregation; (5) the I/O is deferred 11127754SJeff.Bonwick@Sun.COM * due to vdev-level queueing; (6) the I/O is handed off to 11137754SJeff.Bonwick@Sun.COM * another thread. In all cases, the pipeline stops whenever 11147754SJeff.Bonwick@Sun.COM * there's no CPU work; it never burns a thread in cv_wait(). 11157754SJeff.Bonwick@Sun.COM * 11167754SJeff.Bonwick@Sun.COM * There's no locking on io_stage because there's no legitimate way 11177754SJeff.Bonwick@Sun.COM * for multiple threads to be attempting to process the same I/O. 11187754SJeff.Bonwick@Sun.COM */ 111910922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[]; 1120789Sahrens 11217754SJeff.Bonwick@Sun.COM void 11227754SJeff.Bonwick@Sun.COM zio_execute(zio_t *zio) 11237754SJeff.Bonwick@Sun.COM { 11247754SJeff.Bonwick@Sun.COM zio->io_executor = curthread; 11257754SJeff.Bonwick@Sun.COM 11267754SJeff.Bonwick@Sun.COM while (zio->io_stage < ZIO_STAGE_DONE) { 112710922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = zio->io_pipeline; 112810922SJeff.Bonwick@Sun.COM enum zio_stage stage = zio->io_stage; 11297754SJeff.Bonwick@Sun.COM int rv; 11307754SJeff.Bonwick@Sun.COM 11317754SJeff.Bonwick@Sun.COM ASSERT(!MUTEX_HELD(&zio->io_lock)); 113210922SJeff.Bonwick@Sun.COM ASSERT(ISP2(stage)); 113310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 113410922SJeff.Bonwick@Sun.COM 113510922SJeff.Bonwick@Sun.COM do { 113610922SJeff.Bonwick@Sun.COM stage <<= 1; 113710922SJeff.Bonwick@Sun.COM } while ((stage & pipeline) == 0); 11387754SJeff.Bonwick@Sun.COM 11397754SJeff.Bonwick@Sun.COM ASSERT(stage <= ZIO_STAGE_DONE); 11407754SJeff.Bonwick@Sun.COM 11417754SJeff.Bonwick@Sun.COM /* 11427754SJeff.Bonwick@Sun.COM * If we are in interrupt context and this pipeline stage 11437754SJeff.Bonwick@Sun.COM * will grab a config lock that is held across I/O, 114410922SJeff.Bonwick@Sun.COM * or may wait for an I/O that needs an interrupt thread 114510922SJeff.Bonwick@Sun.COM * to complete, issue async to avoid deadlock. 114611173SJonathan.Adams@Sun.COM * 114711173SJonathan.Adams@Sun.COM * For VDEV_IO_START, we cut in line so that the io will 114811173SJonathan.Adams@Sun.COM * be sent to disk promptly. 11497754SJeff.Bonwick@Sun.COM */ 115010922SJeff.Bonwick@Sun.COM if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 11517754SJeff.Bonwick@Sun.COM zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 115211173SJonathan.Adams@Sun.COM boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 115311173SJonathan.Adams@Sun.COM zio_requeue_io_start_cut_in_line : B_FALSE; 115411173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 11557754SJeff.Bonwick@Sun.COM return; 11567754SJeff.Bonwick@Sun.COM } 11577754SJeff.Bonwick@Sun.COM 11587754SJeff.Bonwick@Sun.COM zio->io_stage = stage; 115910922SJeff.Bonwick@Sun.COM rv = zio_pipeline[highbit(stage) - 1](zio); 11607754SJeff.Bonwick@Sun.COM 11617754SJeff.Bonwick@Sun.COM if (rv == ZIO_PIPELINE_STOP) 11627754SJeff.Bonwick@Sun.COM return; 11637754SJeff.Bonwick@Sun.COM 11647754SJeff.Bonwick@Sun.COM ASSERT(rv == ZIO_PIPELINE_CONTINUE); 11657754SJeff.Bonwick@Sun.COM } 1166789Sahrens } 1167789Sahrens 1168789Sahrens /* 1169789Sahrens * ========================================================================== 1170789Sahrens * Initiate I/O, either sync or async 1171789Sahrens * ========================================================================== 1172789Sahrens */ 1173789Sahrens int 1174789Sahrens zio_wait(zio_t *zio) 1175789Sahrens { 1176789Sahrens int error; 1177789Sahrens 1178789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 11797754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 1180789Sahrens 1181789Sahrens zio->io_waiter = curthread; 1182789Sahrens 11835530Sbonwick zio_execute(zio); 1184789Sahrens 1185789Sahrens mutex_enter(&zio->io_lock); 11867754SJeff.Bonwick@Sun.COM while (zio->io_executor != NULL) 1187789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 1188789Sahrens mutex_exit(&zio->io_lock); 1189789Sahrens 1190789Sahrens error = zio->io_error; 11916523Sek110237 zio_destroy(zio); 1192789Sahrens 1193789Sahrens return (error); 1194789Sahrens } 1195789Sahrens 1196789Sahrens void 1197789Sahrens zio_nowait(zio_t *zio) 1198789Sahrens { 11997754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 12007754SJeff.Bonwick@Sun.COM 12018632SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL && 12028632SBill.Moore@Sun.COM zio_unique_parent(zio) == NULL) { 12037754SJeff.Bonwick@Sun.COM /* 12047754SJeff.Bonwick@Sun.COM * This is a logical async I/O with no parent to wait for it. 12059234SGeorge.Wilson@Sun.COM * We add it to the spa_async_root_zio "Godfather" I/O which 12069234SGeorge.Wilson@Sun.COM * will ensure they complete prior to unloading the pool. 12077754SJeff.Bonwick@Sun.COM */ 12087754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 12099234SGeorge.Wilson@Sun.COM 12109234SGeorge.Wilson@Sun.COM zio_add_child(spa->spa_async_zio_root, zio); 12117754SJeff.Bonwick@Sun.COM } 12127754SJeff.Bonwick@Sun.COM 12135530Sbonwick zio_execute(zio); 12145530Sbonwick } 12155530Sbonwick 12167754SJeff.Bonwick@Sun.COM /* 12177754SJeff.Bonwick@Sun.COM * ========================================================================== 12187754SJeff.Bonwick@Sun.COM * Reexecute or suspend/resume failed I/O 12197754SJeff.Bonwick@Sun.COM * ========================================================================== 12207754SJeff.Bonwick@Sun.COM */ 12217754SJeff.Bonwick@Sun.COM 12227754SJeff.Bonwick@Sun.COM static void 12237754SJeff.Bonwick@Sun.COM zio_reexecute(zio_t *pio) 12247754SJeff.Bonwick@Sun.COM { 12258632SBill.Moore@Sun.COM zio_t *cio, *cio_next; 12268632SBill.Moore@Sun.COM 12278632SBill.Moore@Sun.COM ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 12288632SBill.Moore@Sun.COM ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 12299443SBill.Moore@Sun.COM ASSERT(pio->io_gang_leader == NULL); 12309443SBill.Moore@Sun.COM ASSERT(pio->io_gang_tree == NULL); 12317754SJeff.Bonwick@Sun.COM 12327754SJeff.Bonwick@Sun.COM pio->io_flags = pio->io_orig_flags; 12337754SJeff.Bonwick@Sun.COM pio->io_stage = pio->io_orig_stage; 12347754SJeff.Bonwick@Sun.COM pio->io_pipeline = pio->io_orig_pipeline; 12357754SJeff.Bonwick@Sun.COM pio->io_reexecute = 0; 12367754SJeff.Bonwick@Sun.COM pio->io_error = 0; 12378632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12388632SBill.Moore@Sun.COM pio->io_state[w] = 0; 12397754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 12407754SJeff.Bonwick@Sun.COM pio->io_child_error[c] = 0; 12417754SJeff.Bonwick@Sun.COM 124210922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(pio)) 124310922SJeff.Bonwick@Sun.COM BP_ZERO(pio->io_bp); 12447754SJeff.Bonwick@Sun.COM 12457754SJeff.Bonwick@Sun.COM /* 12467754SJeff.Bonwick@Sun.COM * As we reexecute pio's children, new children could be created. 12478632SBill.Moore@Sun.COM * New children go to the head of pio's io_child_list, however, 12487754SJeff.Bonwick@Sun.COM * so we will (correctly) not reexecute them. The key is that 12498632SBill.Moore@Sun.COM * the remainder of pio's io_child_list, from 'cio_next' onward, 12508632SBill.Moore@Sun.COM * cannot be affected by any side effects of reexecuting 'cio'. 12517754SJeff.Bonwick@Sun.COM */ 12528632SBill.Moore@Sun.COM for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 12538632SBill.Moore@Sun.COM cio_next = zio_walk_children(pio); 12547754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 12558632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12568632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w]++; 12577754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 12588632SBill.Moore@Sun.COM zio_reexecute(cio); 12597754SJeff.Bonwick@Sun.COM } 12607754SJeff.Bonwick@Sun.COM 12617754SJeff.Bonwick@Sun.COM /* 12627754SJeff.Bonwick@Sun.COM * Now that all children have been reexecuted, execute the parent. 12639234SGeorge.Wilson@Sun.COM * We don't reexecute "The Godfather" I/O here as it's the 12649234SGeorge.Wilson@Sun.COM * responsibility of the caller to wait on him. 12657754SJeff.Bonwick@Sun.COM */ 12669234SGeorge.Wilson@Sun.COM if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 12679234SGeorge.Wilson@Sun.COM zio_execute(pio); 12687754SJeff.Bonwick@Sun.COM } 12697754SJeff.Bonwick@Sun.COM 12705530Sbonwick void 12717754SJeff.Bonwick@Sun.COM zio_suspend(spa_t *spa, zio_t *zio) 12725530Sbonwick { 12737754SJeff.Bonwick@Sun.COM if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 12747754SJeff.Bonwick@Sun.COM fm_panic("Pool '%s' has encountered an uncorrectable I/O " 12757754SJeff.Bonwick@Sun.COM "failure and the failure mode property for this pool " 12767754SJeff.Bonwick@Sun.COM "is set to panic.", spa_name(spa)); 12777754SJeff.Bonwick@Sun.COM 12787754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 12797754SJeff.Bonwick@Sun.COM 12807754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12817754SJeff.Bonwick@Sun.COM 12827754SJeff.Bonwick@Sun.COM if (spa->spa_suspend_zio_root == NULL) 12839234SGeorge.Wilson@Sun.COM spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 12849234SGeorge.Wilson@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 12859234SGeorge.Wilson@Sun.COM ZIO_FLAG_GODFATHER); 12867754SJeff.Bonwick@Sun.COM 12877754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_TRUE; 12887754SJeff.Bonwick@Sun.COM 12897754SJeff.Bonwick@Sun.COM if (zio != NULL) { 12909234SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 12917754SJeff.Bonwick@Sun.COM ASSERT(zio != spa->spa_suspend_zio_root); 12927754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 12938632SBill.Moore@Sun.COM ASSERT(zio_unique_parent(zio) == NULL); 12947754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_DONE); 12957754SJeff.Bonwick@Sun.COM zio_add_child(spa->spa_suspend_zio_root, zio); 12967754SJeff.Bonwick@Sun.COM } 12977754SJeff.Bonwick@Sun.COM 12987754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12995530Sbonwick } 13005530Sbonwick 13019234SGeorge.Wilson@Sun.COM int 13027754SJeff.Bonwick@Sun.COM zio_resume(spa_t *spa) 13035530Sbonwick { 13049234SGeorge.Wilson@Sun.COM zio_t *pio; 13057754SJeff.Bonwick@Sun.COM 13067754SJeff.Bonwick@Sun.COM /* 13077754SJeff.Bonwick@Sun.COM * Reexecute all previously suspended i/o. 13087754SJeff.Bonwick@Sun.COM */ 13097754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 13107754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_FALSE; 13117754SJeff.Bonwick@Sun.COM cv_broadcast(&spa->spa_suspend_cv); 13127754SJeff.Bonwick@Sun.COM pio = spa->spa_suspend_zio_root; 13137754SJeff.Bonwick@Sun.COM spa->spa_suspend_zio_root = NULL; 13147754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 13157754SJeff.Bonwick@Sun.COM 13167754SJeff.Bonwick@Sun.COM if (pio == NULL) 13179234SGeorge.Wilson@Sun.COM return (0); 13185530Sbonwick 13199234SGeorge.Wilson@Sun.COM zio_reexecute(pio); 13209234SGeorge.Wilson@Sun.COM return (zio_wait(pio)); 13217754SJeff.Bonwick@Sun.COM } 13227754SJeff.Bonwick@Sun.COM 13237754SJeff.Bonwick@Sun.COM void 13247754SJeff.Bonwick@Sun.COM zio_resume_wait(spa_t *spa) 13257754SJeff.Bonwick@Sun.COM { 13267754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 13277754SJeff.Bonwick@Sun.COM while (spa_suspended(spa)) 13287754SJeff.Bonwick@Sun.COM cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 13297754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 1330789Sahrens } 1331789Sahrens 1332789Sahrens /* 1333789Sahrens * ========================================================================== 13347754SJeff.Bonwick@Sun.COM * Gang blocks. 13357754SJeff.Bonwick@Sun.COM * 13367754SJeff.Bonwick@Sun.COM * A gang block is a collection of small blocks that looks to the DMU 13377754SJeff.Bonwick@Sun.COM * like one large block. When zio_dva_allocate() cannot find a block 13387754SJeff.Bonwick@Sun.COM * of the requested size, due to either severe fragmentation or the pool 13397754SJeff.Bonwick@Sun.COM * being nearly full, it calls zio_write_gang_block() to construct the 13407754SJeff.Bonwick@Sun.COM * block from smaller fragments. 13417754SJeff.Bonwick@Sun.COM * 13427754SJeff.Bonwick@Sun.COM * A gang block consists of a gang header (zio_gbh_phys_t) and up to 13437754SJeff.Bonwick@Sun.COM * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 13447754SJeff.Bonwick@Sun.COM * an indirect block: it's an array of block pointers. It consumes 13457754SJeff.Bonwick@Sun.COM * only one sector and hence is allocatable regardless of fragmentation. 13467754SJeff.Bonwick@Sun.COM * The gang header's bps point to its gang members, which hold the data. 13477754SJeff.Bonwick@Sun.COM * 13487754SJeff.Bonwick@Sun.COM * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 13497754SJeff.Bonwick@Sun.COM * as the verifier to ensure uniqueness of the SHA256 checksum. 13507754SJeff.Bonwick@Sun.COM * Critically, the gang block bp's blk_cksum is the checksum of the data, 13517754SJeff.Bonwick@Sun.COM * not the gang header. This ensures that data block signatures (needed for 13527754SJeff.Bonwick@Sun.COM * deduplication) are independent of how the block is physically stored. 13537754SJeff.Bonwick@Sun.COM * 13547754SJeff.Bonwick@Sun.COM * Gang blocks can be nested: a gang member may itself be a gang block. 13557754SJeff.Bonwick@Sun.COM * Thus every gang block is a tree in which root and all interior nodes are 13567754SJeff.Bonwick@Sun.COM * gang headers, and the leaves are normal blocks that contain user data. 13577754SJeff.Bonwick@Sun.COM * The root of the gang tree is called the gang leader. 13587754SJeff.Bonwick@Sun.COM * 13597754SJeff.Bonwick@Sun.COM * To perform any operation (read, rewrite, free, claim) on a gang block, 13607754SJeff.Bonwick@Sun.COM * zio_gang_assemble() first assembles the gang tree (minus data leaves) 13617754SJeff.Bonwick@Sun.COM * in the io_gang_tree field of the original logical i/o by recursively 13627754SJeff.Bonwick@Sun.COM * reading the gang leader and all gang headers below it. This yields 13637754SJeff.Bonwick@Sun.COM * an in-core tree containing the contents of every gang header and the 13647754SJeff.Bonwick@Sun.COM * bps for every constituent of the gang block. 13657754SJeff.Bonwick@Sun.COM * 13667754SJeff.Bonwick@Sun.COM * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 13677754SJeff.Bonwick@Sun.COM * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 13687754SJeff.Bonwick@Sun.COM * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 13697754SJeff.Bonwick@Sun.COM * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 13707754SJeff.Bonwick@Sun.COM * zio_read_gang() is a wrapper around zio_read() that omits reading gang 13717754SJeff.Bonwick@Sun.COM * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 13727754SJeff.Bonwick@Sun.COM * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 13737754SJeff.Bonwick@Sun.COM * of the gang header plus zio_checksum_compute() of the data to update the 13747754SJeff.Bonwick@Sun.COM * gang header's blk_cksum as described above. 13757754SJeff.Bonwick@Sun.COM * 13767754SJeff.Bonwick@Sun.COM * The two-phase assemble/issue model solves the problem of partial failure -- 13777754SJeff.Bonwick@Sun.COM * what if you'd freed part of a gang block but then couldn't read the 13787754SJeff.Bonwick@Sun.COM * gang header for another part? Assembling the entire gang tree first 13797754SJeff.Bonwick@Sun.COM * ensures that all the necessary gang header I/O has succeeded before 13807754SJeff.Bonwick@Sun.COM * starting the actual work of free, claim, or write. Once the gang tree 13817754SJeff.Bonwick@Sun.COM * is assembled, free and claim are in-memory operations that cannot fail. 13827754SJeff.Bonwick@Sun.COM * 13837754SJeff.Bonwick@Sun.COM * In the event that a gang write fails, zio_dva_unallocate() walks the 13847754SJeff.Bonwick@Sun.COM * gang tree to immediately free (i.e. insert back into the space map) 13857754SJeff.Bonwick@Sun.COM * everything we've allocated. This ensures that we don't get ENOSPC 13867754SJeff.Bonwick@Sun.COM * errors during repeated suspend/resume cycles due to a flaky device. 13877754SJeff.Bonwick@Sun.COM * 13887754SJeff.Bonwick@Sun.COM * Gang rewrites only happen during sync-to-convergence. If we can't assemble 13897754SJeff.Bonwick@Sun.COM * the gang tree, we won't modify the block, so we can safely defer the free 13907754SJeff.Bonwick@Sun.COM * (knowing that the block is still intact). If we *can* assemble the gang 13917754SJeff.Bonwick@Sun.COM * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 13927754SJeff.Bonwick@Sun.COM * each constituent bp and we can allocate a new block on the next sync pass. 13937754SJeff.Bonwick@Sun.COM * 13947754SJeff.Bonwick@Sun.COM * In all cases, the gang tree allows complete recovery from partial failure. 1395789Sahrens * ========================================================================== 1396789Sahrens */ 13975530Sbonwick 13987754SJeff.Bonwick@Sun.COM static zio_t * 13997754SJeff.Bonwick@Sun.COM zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14007754SJeff.Bonwick@Sun.COM { 14017754SJeff.Bonwick@Sun.COM if (gn != NULL) 14027754SJeff.Bonwick@Sun.COM return (pio); 14035530Sbonwick 14047754SJeff.Bonwick@Sun.COM return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 14057754SJeff.Bonwick@Sun.COM NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 14067754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 1407789Sahrens } 1408789Sahrens 14097754SJeff.Bonwick@Sun.COM zio_t * 14107754SJeff.Bonwick@Sun.COM zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14116523Sek110237 { 14127754SJeff.Bonwick@Sun.COM zio_t *zio; 14136523Sek110237 14147754SJeff.Bonwick@Sun.COM if (gn != NULL) { 14157754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 14167754SJeff.Bonwick@Sun.COM gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 14177754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14187754SJeff.Bonwick@Sun.COM /* 14197754SJeff.Bonwick@Sun.COM * As we rewrite each gang header, the pipeline will compute 14207754SJeff.Bonwick@Sun.COM * a new gang block header checksum for it; but no one will 14217754SJeff.Bonwick@Sun.COM * compute a new data checksum, so we do that here. The one 14227754SJeff.Bonwick@Sun.COM * exception is the gang leader: the pipeline already computed 14237754SJeff.Bonwick@Sun.COM * its data checksum because that stage precedes gang assembly. 14247754SJeff.Bonwick@Sun.COM * (Presently, nothing actually uses interior data checksums; 14257754SJeff.Bonwick@Sun.COM * this is just good hygiene.) 14267754SJeff.Bonwick@Sun.COM */ 14279443SBill.Moore@Sun.COM if (gn != pio->io_gang_leader->io_gang_tree) { 14287754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 14297754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp)); 14307754SJeff.Bonwick@Sun.COM } 143110922SJeff.Bonwick@Sun.COM /* 143210922SJeff.Bonwick@Sun.COM * If we are here to damage data for testing purposes, 143310922SJeff.Bonwick@Sun.COM * leave the GBH alone so that we can detect the damage. 143410922SJeff.Bonwick@Sun.COM */ 143510922SJeff.Bonwick@Sun.COM if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 143610922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 14377754SJeff.Bonwick@Sun.COM } else { 14387754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 14397754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 14407754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14416523Sek110237 } 14426523Sek110237 14437754SJeff.Bonwick@Sun.COM return (zio); 14447754SJeff.Bonwick@Sun.COM } 14457754SJeff.Bonwick@Sun.COM 14467754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14477754SJeff.Bonwick@Sun.COM zio_t * 14487754SJeff.Bonwick@Sun.COM zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14497754SJeff.Bonwick@Sun.COM { 145010922SJeff.Bonwick@Sun.COM return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 145110922SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio))); 14527754SJeff.Bonwick@Sun.COM } 14537754SJeff.Bonwick@Sun.COM 14547754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14557754SJeff.Bonwick@Sun.COM zio_t * 14567754SJeff.Bonwick@Sun.COM zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14577754SJeff.Bonwick@Sun.COM { 14587754SJeff.Bonwick@Sun.COM return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 14597754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 14607754SJeff.Bonwick@Sun.COM } 14617754SJeff.Bonwick@Sun.COM 14627754SJeff.Bonwick@Sun.COM static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 14637754SJeff.Bonwick@Sun.COM NULL, 14647754SJeff.Bonwick@Sun.COM zio_read_gang, 14657754SJeff.Bonwick@Sun.COM zio_rewrite_gang, 14667754SJeff.Bonwick@Sun.COM zio_free_gang, 14677754SJeff.Bonwick@Sun.COM zio_claim_gang, 14687754SJeff.Bonwick@Sun.COM NULL 14697754SJeff.Bonwick@Sun.COM }; 14707754SJeff.Bonwick@Sun.COM 14717754SJeff.Bonwick@Sun.COM static void zio_gang_tree_assemble_done(zio_t *zio); 14727754SJeff.Bonwick@Sun.COM 14737754SJeff.Bonwick@Sun.COM static zio_gang_node_t * 14747754SJeff.Bonwick@Sun.COM zio_gang_node_alloc(zio_gang_node_t **gnpp) 14757754SJeff.Bonwick@Sun.COM { 14767754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn; 14777754SJeff.Bonwick@Sun.COM 14787754SJeff.Bonwick@Sun.COM ASSERT(*gnpp == NULL); 14797754SJeff.Bonwick@Sun.COM 14807754SJeff.Bonwick@Sun.COM gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 14817754SJeff.Bonwick@Sun.COM gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 14827754SJeff.Bonwick@Sun.COM *gnpp = gn; 14837754SJeff.Bonwick@Sun.COM 14847754SJeff.Bonwick@Sun.COM return (gn); 14856523Sek110237 } 14866523Sek110237 14876523Sek110237 static void 14887754SJeff.Bonwick@Sun.COM zio_gang_node_free(zio_gang_node_t **gnpp) 14897754SJeff.Bonwick@Sun.COM { 14907754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 14917754SJeff.Bonwick@Sun.COM 14927754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 14937754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_child[g] == NULL); 14947754SJeff.Bonwick@Sun.COM 14957754SJeff.Bonwick@Sun.COM zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 14967754SJeff.Bonwick@Sun.COM kmem_free(gn, sizeof (*gn)); 14977754SJeff.Bonwick@Sun.COM *gnpp = NULL; 14987754SJeff.Bonwick@Sun.COM } 14997754SJeff.Bonwick@Sun.COM 15007754SJeff.Bonwick@Sun.COM static void 15017754SJeff.Bonwick@Sun.COM zio_gang_tree_free(zio_gang_node_t **gnpp) 1502789Sahrens { 15037754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 15047754SJeff.Bonwick@Sun.COM 15057754SJeff.Bonwick@Sun.COM if (gn == NULL) 15067754SJeff.Bonwick@Sun.COM return; 15077754SJeff.Bonwick@Sun.COM 15087754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 15097754SJeff.Bonwick@Sun.COM zio_gang_tree_free(&gn->gn_child[g]); 15107754SJeff.Bonwick@Sun.COM 15117754SJeff.Bonwick@Sun.COM zio_gang_node_free(gnpp); 15127754SJeff.Bonwick@Sun.COM } 15137754SJeff.Bonwick@Sun.COM 15147754SJeff.Bonwick@Sun.COM static void 15159443SBill.Moore@Sun.COM zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 15167754SJeff.Bonwick@Sun.COM { 15177754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1518789Sahrens 15199443SBill.Moore@Sun.COM ASSERT(gio->io_gang_leader == gio); 15207754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp)); 15217754SJeff.Bonwick@Sun.COM 15229443SBill.Moore@Sun.COM zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 15237754SJeff.Bonwick@Sun.COM SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 15249443SBill.Moore@Sun.COM gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 15257754SJeff.Bonwick@Sun.COM } 15267754SJeff.Bonwick@Sun.COM 15277754SJeff.Bonwick@Sun.COM static void 15287754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble_done(zio_t *zio) 15297754SJeff.Bonwick@Sun.COM { 15309443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 15317754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio->io_private; 15327754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 15337754SJeff.Bonwick@Sun.COM 15349443SBill.Moore@Sun.COM ASSERT(gio == zio_unique_parent(zio)); 153510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 15367754SJeff.Bonwick@Sun.COM 15377754SJeff.Bonwick@Sun.COM if (zio->io_error) 15387754SJeff.Bonwick@Sun.COM return; 15397754SJeff.Bonwick@Sun.COM 15407754SJeff.Bonwick@Sun.COM if (BP_SHOULD_BYTESWAP(bp)) 15417754SJeff.Bonwick@Sun.COM byteswap_uint64_array(zio->io_data, zio->io_size); 15427754SJeff.Bonwick@Sun.COM 15437754SJeff.Bonwick@Sun.COM ASSERT(zio->io_data == gn->gn_gbh); 15447754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 154511670SNeil.Perrin@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 15467754SJeff.Bonwick@Sun.COM 15477754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 15487754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 15497754SJeff.Bonwick@Sun.COM if (!BP_IS_GANG(gbp)) 15507754SJeff.Bonwick@Sun.COM continue; 15519443SBill.Moore@Sun.COM zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1552789Sahrens } 1553789Sahrens } 1554789Sahrens 15557754SJeff.Bonwick@Sun.COM static void 15567754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1557789Sahrens { 15589443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 15597754SJeff.Bonwick@Sun.COM zio_t *zio; 15607754SJeff.Bonwick@Sun.COM 15617754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp) == !!gn); 15629443SBill.Moore@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 15639443SBill.Moore@Sun.COM ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 15647754SJeff.Bonwick@Sun.COM 15657754SJeff.Bonwick@Sun.COM /* 15667754SJeff.Bonwick@Sun.COM * If you're a gang header, your data is in gn->gn_gbh. 15677754SJeff.Bonwick@Sun.COM * If you're a gang member, your data is in 'data' and gn == NULL. 15687754SJeff.Bonwick@Sun.COM */ 15699443SBill.Moore@Sun.COM zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1570789Sahrens 15717754SJeff.Bonwick@Sun.COM if (gn != NULL) { 157211670SNeil.Perrin@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 15737754SJeff.Bonwick@Sun.COM 15747754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 15757754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 15767754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(gbp)) 15777754SJeff.Bonwick@Sun.COM continue; 15787754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 15797754SJeff.Bonwick@Sun.COM data = (char *)data + BP_GET_PSIZE(gbp); 15807754SJeff.Bonwick@Sun.COM } 15817754SJeff.Bonwick@Sun.COM } 15827754SJeff.Bonwick@Sun.COM 15839443SBill.Moore@Sun.COM if (gn == gio->io_gang_tree) 15849443SBill.Moore@Sun.COM ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 15857754SJeff.Bonwick@Sun.COM 15867754SJeff.Bonwick@Sun.COM if (zio != pio) 15877754SJeff.Bonwick@Sun.COM zio_nowait(zio); 1588789Sahrens } 1589789Sahrens 15905530Sbonwick static int 15917754SJeff.Bonwick@Sun.COM zio_gang_assemble(zio_t *zio) 15925329Sgw25295 { 15935530Sbonwick blkptr_t *bp = zio->io_bp; 15945530Sbonwick 15959443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 15969443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 15979443SBill.Moore@Sun.COM 15989443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 15995530Sbonwick 16007754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1601789Sahrens 16025530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1603789Sahrens } 1604789Sahrens 16055530Sbonwick static int 16067754SJeff.Bonwick@Sun.COM zio_gang_issue(zio_t *zio) 16076523Sek110237 { 16086523Sek110237 blkptr_t *bp = zio->io_bp; 1609789Sahrens 16107754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 16117754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 16125329Sgw25295 16139443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 16149443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1615789Sahrens 16167754SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 16179443SBill.Moore@Sun.COM zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 16187754SJeff.Bonwick@Sun.COM else 16199443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 1620789Sahrens 16217754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 16225530Sbonwick 16235530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1624789Sahrens } 1625789Sahrens 1626789Sahrens static void 16277754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready(zio_t *zio) 1628789Sahrens { 16298632SBill.Moore@Sun.COM zio_t *pio = zio_unique_parent(zio); 16309443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 16311775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 16321775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1633789Sahrens uint64_t asize; 16347754SJeff.Bonwick@Sun.COM 16357754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(zio->io_bp)) 16367754SJeff.Bonwick@Sun.COM return; 16377754SJeff.Bonwick@Sun.COM 16387754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1639789Sahrens 16407754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 164110922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 164210922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 164310922SJeff.Bonwick@Sun.COM ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 16441775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 16451775Sbillm 1646789Sahrens mutex_enter(&pio->io_lock); 16477754SJeff.Bonwick@Sun.COM for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 16481775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 16491775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 16501775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 16511775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 16521775Sbillm } 1653789Sahrens mutex_exit(&pio->io_lock); 1654789Sahrens } 1655789Sahrens 16565329Sgw25295 static int 16577754SJeff.Bonwick@Sun.COM zio_write_gang_block(zio_t *pio) 1658789Sahrens { 16597754SJeff.Bonwick@Sun.COM spa_t *spa = pio->io_spa; 16607754SJeff.Bonwick@Sun.COM blkptr_t *bp = pio->io_bp; 16619443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 16627754SJeff.Bonwick@Sun.COM zio_t *zio; 16637754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn, **gnpp; 1664789Sahrens zio_gbh_phys_t *gbh; 16657754SJeff.Bonwick@Sun.COM uint64_t txg = pio->io_txg; 16667754SJeff.Bonwick@Sun.COM uint64_t resid = pio->io_size; 16677754SJeff.Bonwick@Sun.COM uint64_t lsize; 166810922SJeff.Bonwick@Sun.COM int copies = gio->io_prop.zp_copies; 166910922SJeff.Bonwick@Sun.COM int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 16707754SJeff.Bonwick@Sun.COM zio_prop_t zp; 1671789Sahrens int error; 1672789Sahrens 167310922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 167410922SJeff.Bonwick@Sun.COM bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 16757754SJeff.Bonwick@Sun.COM METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 16765530Sbonwick if (error) { 16777754SJeff.Bonwick@Sun.COM pio->io_error = error; 16785530Sbonwick return (ZIO_PIPELINE_CONTINUE); 16795530Sbonwick } 1680789Sahrens 16819443SBill.Moore@Sun.COM if (pio == gio) { 16829443SBill.Moore@Sun.COM gnpp = &gio->io_gang_tree; 16837754SJeff.Bonwick@Sun.COM } else { 16847754SJeff.Bonwick@Sun.COM gnpp = pio->io_private; 16857754SJeff.Bonwick@Sun.COM ASSERT(pio->io_ready == zio_write_gang_member_ready); 1686789Sahrens } 1687789Sahrens 16887754SJeff.Bonwick@Sun.COM gn = zio_gang_node_alloc(gnpp); 16897754SJeff.Bonwick@Sun.COM gbh = gn->gn_gbh; 16907754SJeff.Bonwick@Sun.COM bzero(gbh, SPA_GANGBLOCKSIZE); 1691789Sahrens 16927754SJeff.Bonwick@Sun.COM /* 16937754SJeff.Bonwick@Sun.COM * Create the gang header. 16947754SJeff.Bonwick@Sun.COM */ 16957754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 16967754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 16975530Sbonwick 16981775Sbillm /* 16997754SJeff.Bonwick@Sun.COM * Create and nowait the gang children. 17001775Sbillm */ 17017754SJeff.Bonwick@Sun.COM for (int g = 0; resid != 0; resid -= lsize, g++) { 17027754SJeff.Bonwick@Sun.COM lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 17037754SJeff.Bonwick@Sun.COM SPA_MINBLOCKSIZE); 17047754SJeff.Bonwick@Sun.COM ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 17057754SJeff.Bonwick@Sun.COM 17069443SBill.Moore@Sun.COM zp.zp_checksum = gio->io_prop.zp_checksum; 17077754SJeff.Bonwick@Sun.COM zp.zp_compress = ZIO_COMPRESS_OFF; 17087754SJeff.Bonwick@Sun.COM zp.zp_type = DMU_OT_NONE; 17097754SJeff.Bonwick@Sun.COM zp.zp_level = 0; 171010922SJeff.Bonwick@Sun.COM zp.zp_copies = gio->io_prop.zp_copies; 171110922SJeff.Bonwick@Sun.COM zp.zp_dedup = 0; 171210922SJeff.Bonwick@Sun.COM zp.zp_dedup_verify = 0; 17137754SJeff.Bonwick@Sun.COM 17147754SJeff.Bonwick@Sun.COM zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 17157754SJeff.Bonwick@Sun.COM (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 17167754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready, NULL, &gn->gn_child[g], 17177754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 17187754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 17197754SJeff.Bonwick@Sun.COM } 17207754SJeff.Bonwick@Sun.COM 17217754SJeff.Bonwick@Sun.COM /* 17227754SJeff.Bonwick@Sun.COM * Set pio's pipeline to just wait for zio to finish. 17237754SJeff.Bonwick@Sun.COM */ 17247754SJeff.Bonwick@Sun.COM pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 17257754SJeff.Bonwick@Sun.COM 17267754SJeff.Bonwick@Sun.COM zio_nowait(zio); 17277754SJeff.Bonwick@Sun.COM 17287754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1729789Sahrens } 1730789Sahrens 1731789Sahrens /* 1732789Sahrens * ========================================================================== 173310922SJeff.Bonwick@Sun.COM * Dedup 173410922SJeff.Bonwick@Sun.COM * ========================================================================== 173510922SJeff.Bonwick@Sun.COM */ 173610922SJeff.Bonwick@Sun.COM static void 173710922SJeff.Bonwick@Sun.COM zio_ddt_child_read_done(zio_t *zio) 173810922SJeff.Bonwick@Sun.COM { 173910922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 174010922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 174110922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 174210922SJeff.Bonwick@Sun.COM zio_t *pio = zio_unique_parent(zio); 174310922SJeff.Bonwick@Sun.COM 174410922SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 174510922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 174610922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) 174710922SJeff.Bonwick@Sun.COM ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 174810922SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && dde->dde_repair_data == NULL) 174910922SJeff.Bonwick@Sun.COM dde->dde_repair_data = zio->io_data; 175010922SJeff.Bonwick@Sun.COM else 175110922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zio->io_size); 175210922SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 175310922SJeff.Bonwick@Sun.COM } 175410922SJeff.Bonwick@Sun.COM 175510922SJeff.Bonwick@Sun.COM static int 175610922SJeff.Bonwick@Sun.COM zio_ddt_read_start(zio_t *zio) 175710922SJeff.Bonwick@Sun.COM { 175810922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 175910922SJeff.Bonwick@Sun.COM 176010922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 176110922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 176210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 176310922SJeff.Bonwick@Sun.COM 176410922SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_DDT]) { 176510922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 176610922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = ddt_repair_start(ddt, bp); 176710922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = dde->dde_phys; 176810922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 176910922SJeff.Bonwick@Sun.COM blkptr_t blk; 177010922SJeff.Bonwick@Sun.COM 177110922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vsd == NULL); 177210922SJeff.Bonwick@Sun.COM zio->io_vsd = dde; 177310922SJeff.Bonwick@Sun.COM 177410922SJeff.Bonwick@Sun.COM if (ddp_self == NULL) 177510922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 177610922SJeff.Bonwick@Sun.COM 177710922SJeff.Bonwick@Sun.COM for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 177810922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 177910922SJeff.Bonwick@Sun.COM continue; 178011125SJeff.Bonwick@Sun.COM ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 178111125SJeff.Bonwick@Sun.COM &blk); 178210922SJeff.Bonwick@Sun.COM zio_nowait(zio_read(zio, zio->io_spa, &blk, 178310922SJeff.Bonwick@Sun.COM zio_buf_alloc(zio->io_size), zio->io_size, 178410922SJeff.Bonwick@Sun.COM zio_ddt_child_read_done, dde, zio->io_priority, 178510922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 178610922SJeff.Bonwick@Sun.COM &zio->io_bookmark)); 178710922SJeff.Bonwick@Sun.COM } 178810922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 178910922SJeff.Bonwick@Sun.COM } 179010922SJeff.Bonwick@Sun.COM 179110922SJeff.Bonwick@Sun.COM zio_nowait(zio_read(zio, zio->io_spa, bp, 179210922SJeff.Bonwick@Sun.COM zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 179310922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 179410922SJeff.Bonwick@Sun.COM 179510922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 179610922SJeff.Bonwick@Sun.COM } 179710922SJeff.Bonwick@Sun.COM 179810922SJeff.Bonwick@Sun.COM static int 179910922SJeff.Bonwick@Sun.COM zio_ddt_read_done(zio_t *zio) 180010922SJeff.Bonwick@Sun.COM { 180110922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 180210922SJeff.Bonwick@Sun.COM 180310922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 180410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 180510922SJeff.Bonwick@Sun.COM 180610922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 180710922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 180810922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 180910922SJeff.Bonwick@Sun.COM 181010922SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_DDT]) { 181110922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 181210922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_vsd; 181310922SJeff.Bonwick@Sun.COM if (ddt == NULL) { 181411147SGeorge.Wilson@Sun.COM ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 181510922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 181610922SJeff.Bonwick@Sun.COM } 181710922SJeff.Bonwick@Sun.COM if (dde == NULL) { 181810922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 181911173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 182010922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 182110922SJeff.Bonwick@Sun.COM } 182210922SJeff.Bonwick@Sun.COM if (dde->dde_repair_data != NULL) { 182310922SJeff.Bonwick@Sun.COM bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 182410922SJeff.Bonwick@Sun.COM zio->io_child_error[ZIO_CHILD_DDT] = 0; 182510922SJeff.Bonwick@Sun.COM } 182610922SJeff.Bonwick@Sun.COM ddt_repair_done(ddt, dde); 182710922SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 182810922SJeff.Bonwick@Sun.COM } 182910922SJeff.Bonwick@Sun.COM 183010922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vsd == NULL); 183110922SJeff.Bonwick@Sun.COM 183210922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 183310922SJeff.Bonwick@Sun.COM } 183410922SJeff.Bonwick@Sun.COM 183510922SJeff.Bonwick@Sun.COM static boolean_t 183610922SJeff.Bonwick@Sun.COM zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 183710922SJeff.Bonwick@Sun.COM { 183810922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 183910922SJeff.Bonwick@Sun.COM 184010922SJeff.Bonwick@Sun.COM /* 184110922SJeff.Bonwick@Sun.COM * Note: we compare the original data, not the transformed data, 184210922SJeff.Bonwick@Sun.COM * because when zio->io_bp is an override bp, we will not have 184310922SJeff.Bonwick@Sun.COM * pushed the I/O transforms. That's an important optimization 184410922SJeff.Bonwick@Sun.COM * because otherwise we'd compress/encrypt all dmu_sync() data twice. 184510922SJeff.Bonwick@Sun.COM */ 184610922SJeff.Bonwick@Sun.COM for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 184710922SJeff.Bonwick@Sun.COM zio_t *lio = dde->dde_lead_zio[p]; 184810922SJeff.Bonwick@Sun.COM 184910922SJeff.Bonwick@Sun.COM if (lio != NULL) { 185010922SJeff.Bonwick@Sun.COM return (lio->io_orig_size != zio->io_orig_size || 185110922SJeff.Bonwick@Sun.COM bcmp(zio->io_orig_data, lio->io_orig_data, 185210922SJeff.Bonwick@Sun.COM zio->io_orig_size) != 0); 185310922SJeff.Bonwick@Sun.COM } 185410922SJeff.Bonwick@Sun.COM } 185510922SJeff.Bonwick@Sun.COM 185610922SJeff.Bonwick@Sun.COM for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 185710922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 185810922SJeff.Bonwick@Sun.COM 185910922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) { 186010922SJeff.Bonwick@Sun.COM arc_buf_t *abuf = NULL; 186110922SJeff.Bonwick@Sun.COM uint32_t aflags = ARC_WAIT; 186210922SJeff.Bonwick@Sun.COM blkptr_t blk = *zio->io_bp; 186310922SJeff.Bonwick@Sun.COM int error; 186410922SJeff.Bonwick@Sun.COM 186510922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 186610922SJeff.Bonwick@Sun.COM 186710922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 186810922SJeff.Bonwick@Sun.COM 186910922SJeff.Bonwick@Sun.COM error = arc_read_nolock(NULL, spa, &blk, 187010922SJeff.Bonwick@Sun.COM arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 187110922SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 187210922SJeff.Bonwick@Sun.COM &aflags, &zio->io_bookmark); 187310922SJeff.Bonwick@Sun.COM 187410922SJeff.Bonwick@Sun.COM if (error == 0) { 187510922SJeff.Bonwick@Sun.COM if (arc_buf_size(abuf) != zio->io_orig_size || 187610922SJeff.Bonwick@Sun.COM bcmp(abuf->b_data, zio->io_orig_data, 187710922SJeff.Bonwick@Sun.COM zio->io_orig_size) != 0) 187810922SJeff.Bonwick@Sun.COM error = EEXIST; 187910922SJeff.Bonwick@Sun.COM VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 188010922SJeff.Bonwick@Sun.COM } 188110922SJeff.Bonwick@Sun.COM 188210922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 188310922SJeff.Bonwick@Sun.COM return (error != 0); 188410922SJeff.Bonwick@Sun.COM } 188510922SJeff.Bonwick@Sun.COM } 188610922SJeff.Bonwick@Sun.COM 188710922SJeff.Bonwick@Sun.COM return (B_FALSE); 188810922SJeff.Bonwick@Sun.COM } 188910922SJeff.Bonwick@Sun.COM 189010922SJeff.Bonwick@Sun.COM static void 189110922SJeff.Bonwick@Sun.COM zio_ddt_child_write_ready(zio_t *zio) 189210922SJeff.Bonwick@Sun.COM { 189310922SJeff.Bonwick@Sun.COM int p = zio->io_prop.zp_copies; 189410922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 189510922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 189610922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 189710922SJeff.Bonwick@Sun.COM zio_t *pio; 189810922SJeff.Bonwick@Sun.COM 189910922SJeff.Bonwick@Sun.COM if (zio->io_error) 190010922SJeff.Bonwick@Sun.COM return; 190110922SJeff.Bonwick@Sun.COM 190210922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 190310922SJeff.Bonwick@Sun.COM 190410922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 190510922SJeff.Bonwick@Sun.COM 190610922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, zio->io_bp); 190710922SJeff.Bonwick@Sun.COM 190810922SJeff.Bonwick@Sun.COM while ((pio = zio_walk_parents(zio)) != NULL) 190910922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 191010922SJeff.Bonwick@Sun.COM 191110922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 191210922SJeff.Bonwick@Sun.COM } 191310922SJeff.Bonwick@Sun.COM 191410922SJeff.Bonwick@Sun.COM static void 191510922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done(zio_t *zio) 191610922SJeff.Bonwick@Sun.COM { 191710922SJeff.Bonwick@Sun.COM int p = zio->io_prop.zp_copies; 191810922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 191910922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 192010922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 192110922SJeff.Bonwick@Sun.COM 192210922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 192310922SJeff.Bonwick@Sun.COM 192410922SJeff.Bonwick@Sun.COM ASSERT(ddp->ddp_refcnt == 0); 192510922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 192610922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = NULL; 192710922SJeff.Bonwick@Sun.COM 192810922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 192910922SJeff.Bonwick@Sun.COM while (zio_walk_parents(zio) != NULL) 193010922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 193110922SJeff.Bonwick@Sun.COM } else { 193210922SJeff.Bonwick@Sun.COM ddt_phys_clear(ddp); 193310922SJeff.Bonwick@Sun.COM } 193410922SJeff.Bonwick@Sun.COM 193510922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 193610922SJeff.Bonwick@Sun.COM } 193710922SJeff.Bonwick@Sun.COM 193810922SJeff.Bonwick@Sun.COM static void 193910922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done(zio_t *zio) 194010922SJeff.Bonwick@Sun.COM { 194110922SJeff.Bonwick@Sun.COM int p = DDT_PHYS_DITTO; 194210922SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 194310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 194410922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 194510922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 194610922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 194710922SJeff.Bonwick@Sun.COM ddt_key_t *ddk = &dde->dde_key; 194810922SJeff.Bonwick@Sun.COM 194910922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 195010922SJeff.Bonwick@Sun.COM 195110922SJeff.Bonwick@Sun.COM ASSERT(ddp->ddp_refcnt == 0); 195210922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 195310922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = NULL; 195410922SJeff.Bonwick@Sun.COM 195510922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 195610922SJeff.Bonwick@Sun.COM ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 195710922SJeff.Bonwick@Sun.COM ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 195810922SJeff.Bonwick@Sun.COM ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 195910922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 196010922SJeff.Bonwick@Sun.COM ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 196110922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 196210922SJeff.Bonwick@Sun.COM } 196310922SJeff.Bonwick@Sun.COM 196410922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 196510922SJeff.Bonwick@Sun.COM } 196610922SJeff.Bonwick@Sun.COM 196710922SJeff.Bonwick@Sun.COM static int 196810922SJeff.Bonwick@Sun.COM zio_ddt_write(zio_t *zio) 196910922SJeff.Bonwick@Sun.COM { 197010922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 197110922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 197210922SJeff.Bonwick@Sun.COM uint64_t txg = zio->io_txg; 197310922SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 197410922SJeff.Bonwick@Sun.COM int p = zp->zp_copies; 197510922SJeff.Bonwick@Sun.COM int ditto_copies; 197610922SJeff.Bonwick@Sun.COM zio_t *cio = NULL; 197710922SJeff.Bonwick@Sun.COM zio_t *dio = NULL; 197810922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(spa, bp); 197910922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 198010922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 198110922SJeff.Bonwick@Sun.COM 198210922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 198310922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 198410922SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 198510922SJeff.Bonwick@Sun.COM 198610922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 198710922SJeff.Bonwick@Sun.COM dde = ddt_lookup(ddt, bp, B_TRUE); 198810922SJeff.Bonwick@Sun.COM ddp = &dde->dde_phys[p]; 198910922SJeff.Bonwick@Sun.COM 199010922SJeff.Bonwick@Sun.COM if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 199110922SJeff.Bonwick@Sun.COM /* 199210922SJeff.Bonwick@Sun.COM * If we're using a weak checksum, upgrade to a strong checksum 199310922SJeff.Bonwick@Sun.COM * and try again. If we're already using a strong checksum, 199410922SJeff.Bonwick@Sun.COM * we can't resolve it, so just convert to an ordinary write. 199510922SJeff.Bonwick@Sun.COM * (And automatically e-mail a paper to Nature?) 199610922SJeff.Bonwick@Sun.COM */ 199710922SJeff.Bonwick@Sun.COM if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 199810922SJeff.Bonwick@Sun.COM zp->zp_checksum = spa_dedup_checksum(spa); 199910922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 200010922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 200110922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 200210922SJeff.Bonwick@Sun.COM } else { 200310922SJeff.Bonwick@Sun.COM zp->zp_dedup = 0; 200410922SJeff.Bonwick@Sun.COM } 200510922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 200610922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 200710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 200810922SJeff.Bonwick@Sun.COM } 200910922SJeff.Bonwick@Sun.COM 201010922SJeff.Bonwick@Sun.COM ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 201110922SJeff.Bonwick@Sun.COM ASSERT(ditto_copies < SPA_DVAS_PER_BP); 201210922SJeff.Bonwick@Sun.COM 201310922SJeff.Bonwick@Sun.COM if (ditto_copies > ddt_ditto_copies_present(dde) && 201410922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 201510922SJeff.Bonwick@Sun.COM zio_prop_t czp = *zp; 201610922SJeff.Bonwick@Sun.COM 201710922SJeff.Bonwick@Sun.COM czp.zp_copies = ditto_copies; 201810922SJeff.Bonwick@Sun.COM 201910922SJeff.Bonwick@Sun.COM /* 202010922SJeff.Bonwick@Sun.COM * If we arrived here with an override bp, we won't have run 202110922SJeff.Bonwick@Sun.COM * the transform stack, so we won't have the data we need to 202210922SJeff.Bonwick@Sun.COM * generate a child i/o. So, toss the override bp and restart. 202310922SJeff.Bonwick@Sun.COM * This is safe, because using the override bp is just an 202410922SJeff.Bonwick@Sun.COM * optimization; and it's rare, so the cost doesn't matter. 202510922SJeff.Bonwick@Sun.COM */ 202610922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 202710922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 202810922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 202910922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 203010922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 203110922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 203210922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 203310922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 203410922SJeff.Bonwick@Sun.COM } 203510922SJeff.Bonwick@Sun.COM 203610922SJeff.Bonwick@Sun.COM dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 203710922SJeff.Bonwick@Sun.COM zio->io_orig_size, &czp, NULL, 203810922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done, dde, zio->io_priority, 203910922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 204010922SJeff.Bonwick@Sun.COM 204110922SJeff.Bonwick@Sun.COM zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 204210922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 204310922SJeff.Bonwick@Sun.COM } 204410922SJeff.Bonwick@Sun.COM 204510922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 204610922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 204710922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, bp, txg); 204810922SJeff.Bonwick@Sun.COM if (dde->dde_lead_zio[p] != NULL) 204910922SJeff.Bonwick@Sun.COM zio_add_child(zio, dde->dde_lead_zio[p]); 205010922SJeff.Bonwick@Sun.COM else 205110922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 205210922SJeff.Bonwick@Sun.COM } else if (zio->io_bp_override) { 205310922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == txg); 205410922SJeff.Bonwick@Sun.COM ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 205510922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 205610922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 205710922SJeff.Bonwick@Sun.COM } else { 205810922SJeff.Bonwick@Sun.COM cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 205910922SJeff.Bonwick@Sun.COM zio->io_orig_size, zp, zio_ddt_child_write_ready, 206010922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done, dde, zio->io_priority, 206110922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 206210922SJeff.Bonwick@Sun.COM 206310922SJeff.Bonwick@Sun.COM zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 206410922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = cio; 206510922SJeff.Bonwick@Sun.COM } 206610922SJeff.Bonwick@Sun.COM 206710922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 206810922SJeff.Bonwick@Sun.COM 206910922SJeff.Bonwick@Sun.COM if (cio) 207010922SJeff.Bonwick@Sun.COM zio_nowait(cio); 207110922SJeff.Bonwick@Sun.COM if (dio) 207210922SJeff.Bonwick@Sun.COM zio_nowait(dio); 207310922SJeff.Bonwick@Sun.COM 207410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 207510922SJeff.Bonwick@Sun.COM } 207610922SJeff.Bonwick@Sun.COM 207712296SLin.Ling@Sun.COM ddt_entry_t *freedde; /* for debugging */ 207812296SLin.Ling@Sun.COM 207910922SJeff.Bonwick@Sun.COM static int 208010922SJeff.Bonwick@Sun.COM zio_ddt_free(zio_t *zio) 208110922SJeff.Bonwick@Sun.COM { 208210922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 208310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 208410922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(spa, bp); 208510922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 208610922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 208710922SJeff.Bonwick@Sun.COM 208810922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 208910922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 209010922SJeff.Bonwick@Sun.COM 209110922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 209212296SLin.Ling@Sun.COM freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 209310922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 209410922SJeff.Bonwick@Sun.COM ddt_phys_decref(ddp); 209510922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 209610922SJeff.Bonwick@Sun.COM 209710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 209810922SJeff.Bonwick@Sun.COM } 209910922SJeff.Bonwick@Sun.COM 210010922SJeff.Bonwick@Sun.COM /* 210110922SJeff.Bonwick@Sun.COM * ========================================================================== 2102789Sahrens * Allocate and free blocks 2103789Sahrens * ========================================================================== 2104789Sahrens */ 21055530Sbonwick static int 2106789Sahrens zio_dva_allocate(zio_t *zio) 2107789Sahrens { 21084527Sperrin spa_t *spa = zio->io_spa; 210910922SJeff.Bonwick@Sun.COM metaslab_class_t *mc = spa_normal_class(spa); 2110789Sahrens blkptr_t *bp = zio->io_bp; 2111789Sahrens int error; 2112789Sahrens 21139443SBill.Moore@Sun.COM if (zio->io_gang_leader == NULL) { 21149443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 21159443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 21169443SBill.Moore@Sun.COM } 21179443SBill.Moore@Sun.COM 2118789Sahrens ASSERT(BP_IS_HOLE(bp)); 21191775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 212010922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, >, 0); 212110922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2122789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2123789Sahrens 21247754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, mc, zio->io_size, bp, 212510922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies, zio->io_txg, NULL, 0); 2126789Sahrens 21277754SJeff.Bonwick@Sun.COM if (error) { 21287754SJeff.Bonwick@Sun.COM if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 21297754SJeff.Bonwick@Sun.COM return (zio_write_gang_block(zio)); 2130789Sahrens zio->io_error = error; 2131789Sahrens } 21325530Sbonwick 21335530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2134789Sahrens } 2135789Sahrens 21365530Sbonwick static int 2137789Sahrens zio_dva_free(zio_t *zio) 2138789Sahrens { 21397754SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2140789Sahrens 21415530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2142789Sahrens } 2143789Sahrens 21445530Sbonwick static int 2145789Sahrens zio_dva_claim(zio_t *zio) 2146789Sahrens { 21477754SJeff.Bonwick@Sun.COM int error; 21487754SJeff.Bonwick@Sun.COM 21497754SJeff.Bonwick@Sun.COM error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 21507754SJeff.Bonwick@Sun.COM if (error) 21517754SJeff.Bonwick@Sun.COM zio->io_error = error; 2152789Sahrens 21535530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2154789Sahrens } 2155789Sahrens 2156789Sahrens /* 21577754SJeff.Bonwick@Sun.COM * Undo an allocation. This is used by zio_done() when an I/O fails 21587754SJeff.Bonwick@Sun.COM * and we want to give back the block we just allocated. 21597754SJeff.Bonwick@Sun.COM * This handles both normal blocks and gang blocks. 21607754SJeff.Bonwick@Sun.COM */ 21617754SJeff.Bonwick@Sun.COM static void 21627754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 21637754SJeff.Bonwick@Sun.COM { 21647754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 216510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_bp_override == NULL); 21667754SJeff.Bonwick@Sun.COM 21677754SJeff.Bonwick@Sun.COM if (!BP_IS_HOLE(bp)) 216810922SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 21697754SJeff.Bonwick@Sun.COM 21707754SJeff.Bonwick@Sun.COM if (gn != NULL) { 21717754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 21727754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio, gn->gn_child[g], 21737754SJeff.Bonwick@Sun.COM &gn->gn_gbh->zg_blkptr[g]); 21747754SJeff.Bonwick@Sun.COM } 21757754SJeff.Bonwick@Sun.COM } 21767754SJeff.Bonwick@Sun.COM } 21777754SJeff.Bonwick@Sun.COM 21787754SJeff.Bonwick@Sun.COM /* 21797754SJeff.Bonwick@Sun.COM * Try to allocate an intent log block. Return 0 on success, errno on failure. 21807754SJeff.Bonwick@Sun.COM */ 21817754SJeff.Bonwick@Sun.COM int 218210922SJeff.Bonwick@Sun.COM zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 218310922SJeff.Bonwick@Sun.COM uint64_t size, boolean_t use_slog) 21847754SJeff.Bonwick@Sun.COM { 218510310SNeil.Perrin@Sun.COM int error = 1; 21867754SJeff.Bonwick@Sun.COM 218710922SJeff.Bonwick@Sun.COM ASSERT(txg > spa_syncing_txg(spa)); 218810922SJeff.Bonwick@Sun.COM 218910879SNeil.Perrin@Sun.COM if (use_slog) 219010922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_log_class(spa), size, 219110310SNeil.Perrin@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21927754SJeff.Bonwick@Sun.COM 21937754SJeff.Bonwick@Sun.COM if (error) 219410922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), size, 21957754SJeff.Bonwick@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21967754SJeff.Bonwick@Sun.COM 21977754SJeff.Bonwick@Sun.COM if (error == 0) { 21987754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(new_bp, size); 21997754SJeff.Bonwick@Sun.COM BP_SET_PSIZE(new_bp, size); 22007754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 220111670SNeil.Perrin@Sun.COM BP_SET_CHECKSUM(new_bp, 220211670SNeil.Perrin@Sun.COM spa_version(spa) >= SPA_VERSION_SLIM_ZIL 220311670SNeil.Perrin@Sun.COM ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 22047754SJeff.Bonwick@Sun.COM BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 22057754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(new_bp, 0); 220610922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(new_bp, 0); 22077754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 22087754SJeff.Bonwick@Sun.COM } 22097754SJeff.Bonwick@Sun.COM 22107754SJeff.Bonwick@Sun.COM return (error); 22117754SJeff.Bonwick@Sun.COM } 22127754SJeff.Bonwick@Sun.COM 22137754SJeff.Bonwick@Sun.COM /* 221410922SJeff.Bonwick@Sun.COM * Free an intent log block. 22157754SJeff.Bonwick@Sun.COM */ 22167754SJeff.Bonwick@Sun.COM void 221710922SJeff.Bonwick@Sun.COM zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 22187754SJeff.Bonwick@Sun.COM { 221910922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 22207754SJeff.Bonwick@Sun.COM ASSERT(!BP_IS_GANG(bp)); 22217754SJeff.Bonwick@Sun.COM 222210922SJeff.Bonwick@Sun.COM zio_free(spa, txg, bp); 22237754SJeff.Bonwick@Sun.COM } 22247754SJeff.Bonwick@Sun.COM 22257754SJeff.Bonwick@Sun.COM /* 2226789Sahrens * ========================================================================== 2227789Sahrens * Read and write to physical devices 2228789Sahrens * ========================================================================== 2229789Sahrens */ 22305530Sbonwick static int 22311775Sbillm zio_vdev_io_start(zio_t *zio) 2232789Sahrens { 2233789Sahrens vdev_t *vd = zio->io_vd; 22341775Sbillm uint64_t align; 22355329Sgw25295 spa_t *spa = zio->io_spa; 22365329Sgw25295 22377754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0); 22387754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 22397754SJeff.Bonwick@Sun.COM 22407754SJeff.Bonwick@Sun.COM if (vd == NULL) { 22417754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 22427754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2243789Sahrens 22447754SJeff.Bonwick@Sun.COM /* 22457754SJeff.Bonwick@Sun.COM * The mirror_ops handle multiple DVAs in a single BP. 22467754SJeff.Bonwick@Sun.COM */ 22475530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 22487754SJeff.Bonwick@Sun.COM } 22491775Sbillm 225012586SGeorge.Wilson@Sun.COM /* 225112586SGeorge.Wilson@Sun.COM * We keep track of time-sensitive I/Os so that the scan thread 225212586SGeorge.Wilson@Sun.COM * can quickly react to certain workloads. In particular, we care 225312586SGeorge.Wilson@Sun.COM * about non-scrubbing, top-level reads and writes with the following 225412586SGeorge.Wilson@Sun.COM * characteristics: 225512586SGeorge.Wilson@Sun.COM * - synchronous writes of user data to non-slog devices 225612586SGeorge.Wilson@Sun.COM * - any reads of user data 225712586SGeorge.Wilson@Sun.COM * When these conditions are met, adjust the timestamp of spa_last_io 225812586SGeorge.Wilson@Sun.COM * which allows the scan thread to adjust its workload accordingly. 225912586SGeorge.Wilson@Sun.COM */ 226012586SGeorge.Wilson@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 226112586SGeorge.Wilson@Sun.COM vd == vd->vdev_top && !vd->vdev_islog && 226212586SGeorge.Wilson@Sun.COM zio->io_bookmark.zb_objset != DMU_META_OBJSET && 226312586SGeorge.Wilson@Sun.COM zio->io_txg != spa_syncing_txg(spa)) { 226412586SGeorge.Wilson@Sun.COM uint64_t old = spa->spa_last_io; 226512586SGeorge.Wilson@Sun.COM uint64_t new = ddi_get_lbolt64(); 226612586SGeorge.Wilson@Sun.COM if (old != new) 226712586SGeorge.Wilson@Sun.COM (void) atomic_cas_64(&spa->spa_last_io, old, new); 226812586SGeorge.Wilson@Sun.COM } 226912586SGeorge.Wilson@Sun.COM 22707754SJeff.Bonwick@Sun.COM align = 1ULL << vd->vdev_top->vdev_ashift; 2271789Sahrens 22721732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 22731732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 22741732Sbonwick char *abuf = zio_buf_alloc(asize); 22757754SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 22761732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 22771732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 22781732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 22791732Sbonwick } 22807754SJeff.Bonwick@Sun.COM zio_push_transform(zio, abuf, asize, asize, zio_subblock); 22811732Sbonwick } 22821732Sbonwick 22831732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 22841732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 2285*13049SGeorge.Wilson@Sun.COM VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 22868241SJeff.Bonwick@Sun.COM 22878241SJeff.Bonwick@Sun.COM /* 22888241SJeff.Bonwick@Sun.COM * If this is a repair I/O, and there's no self-healing involved -- 22898241SJeff.Bonwick@Sun.COM * that is, we're just resilvering what we expect to resilver -- 22908241SJeff.Bonwick@Sun.COM * then don't do the I/O unless zio's txg is actually in vd's DTL. 22918241SJeff.Bonwick@Sun.COM * This prevents spurious resilvering with nested replication. 22928241SJeff.Bonwick@Sun.COM * For example, given a mirror of mirrors, (A+B)+(C+D), if only 22938241SJeff.Bonwick@Sun.COM * A is out of date, we'll read from C+D, then use the data to 22948241SJeff.Bonwick@Sun.COM * resilver A+B -- but we don't actually want to resilver B, just A. 22958241SJeff.Bonwick@Sun.COM * The top-level mirror has no way to know this, so instead we just 22968241SJeff.Bonwick@Sun.COM * discard unnecessary repairs as we work our way down the vdev tree. 22978241SJeff.Bonwick@Sun.COM * The same logic applies to any form of nested replication: 22988241SJeff.Bonwick@Sun.COM * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 22998241SJeff.Bonwick@Sun.COM */ 23008241SJeff.Bonwick@Sun.COM if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 23018241SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 23028241SJeff.Bonwick@Sun.COM zio->io_txg != 0 && /* not a delegated i/o */ 23038241SJeff.Bonwick@Sun.COM !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 23048241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 23058241SJeff.Bonwick@Sun.COM zio_vdev_io_bypass(zio); 23068241SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 23078241SJeff.Bonwick@Sun.COM } 2308789Sahrens 23097754SJeff.Bonwick@Sun.COM if (vd->vdev_ops->vdev_op_leaf && 23107754SJeff.Bonwick@Sun.COM (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 23117754SJeff.Bonwick@Sun.COM 23127754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 23138632SBill.Moore@Sun.COM return (ZIO_PIPELINE_CONTINUE); 23147754SJeff.Bonwick@Sun.COM 23157754SJeff.Bonwick@Sun.COM if ((zio = vdev_queue_io(zio)) == NULL) 23167754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23177754SJeff.Bonwick@Sun.COM 23187754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 23197754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23207754SJeff.Bonwick@Sun.COM zio_interrupt(zio); 23217754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23227754SJeff.Bonwick@Sun.COM } 23237754SJeff.Bonwick@Sun.COM } 23247754SJeff.Bonwick@Sun.COM 23255530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 2326789Sahrens } 2327789Sahrens 23285530Sbonwick static int 2329789Sahrens zio_vdev_io_done(zio_t *zio) 2330789Sahrens { 23317754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 23327754SJeff.Bonwick@Sun.COM vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 23337754SJeff.Bonwick@Sun.COM boolean_t unexpected_error = B_FALSE; 23345530Sbonwick 23357754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 23367754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23377754SJeff.Bonwick@Sun.COM 23387754SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2339789Sahrens 23407754SJeff.Bonwick@Sun.COM if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 23417754SJeff.Bonwick@Sun.COM 23427754SJeff.Bonwick@Sun.COM vdev_queue_io_done(zio); 23437754SJeff.Bonwick@Sun.COM 23447754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE) 23457754SJeff.Bonwick@Sun.COM vdev_cache_write(zio); 23467754SJeff.Bonwick@Sun.COM 23477754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23489725SEric.Schrock@Sun.COM zio->io_error = zio_handle_device_injection(vd, 23499725SEric.Schrock@Sun.COM zio, EIO); 2350789Sahrens 23517754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23527754SJeff.Bonwick@Sun.COM zio->io_error = zio_handle_label_injection(zio, EIO); 23537754SJeff.Bonwick@Sun.COM 23547754SJeff.Bonwick@Sun.COM if (zio->io_error) { 23557754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 23567754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23577754SJeff.Bonwick@Sun.COM } else { 23587754SJeff.Bonwick@Sun.COM unexpected_error = B_TRUE; 23597754SJeff.Bonwick@Sun.COM } 23607754SJeff.Bonwick@Sun.COM } 23616976Seschrock } 23627754SJeff.Bonwick@Sun.COM 23637754SJeff.Bonwick@Sun.COM ops->vdev_op_io_done(zio); 2364789Sahrens 23657754SJeff.Bonwick@Sun.COM if (unexpected_error) 23668632SBill.Moore@Sun.COM VERIFY(vdev_probe(vd, zio) == NULL); 23677754SJeff.Bonwick@Sun.COM 23687754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2369789Sahrens } 2370789Sahrens 237110614SJonathan.Adams@Sun.COM /* 237210614SJonathan.Adams@Sun.COM * For non-raidz ZIOs, we can just copy aside the bad data read from the 237310614SJonathan.Adams@Sun.COM * disk, and use that to finish the checksum ereport later. 237410614SJonathan.Adams@Sun.COM */ 237510614SJonathan.Adams@Sun.COM static void 237610614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 237710614SJonathan.Adams@Sun.COM const void *good_buf) 237810614SJonathan.Adams@Sun.COM { 237910614SJonathan.Adams@Sun.COM /* no processing needed */ 238010614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 238110614SJonathan.Adams@Sun.COM } 238210614SJonathan.Adams@Sun.COM 238310614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 238410614SJonathan.Adams@Sun.COM void 238510614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 238610614SJonathan.Adams@Sun.COM { 238710614SJonathan.Adams@Sun.COM void *buf = zio_buf_alloc(zio->io_size); 238810614SJonathan.Adams@Sun.COM 238910614SJonathan.Adams@Sun.COM bcopy(zio->io_data, buf, zio->io_size); 239010614SJonathan.Adams@Sun.COM 239110614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = zio->io_size; 239210614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = buf; 239310614SJonathan.Adams@Sun.COM zcr->zcr_finish = zio_vsd_default_cksum_finish; 239410614SJonathan.Adams@Sun.COM zcr->zcr_free = zio_buf_free; 239510614SJonathan.Adams@Sun.COM } 239610614SJonathan.Adams@Sun.COM 23975530Sbonwick static int 2398789Sahrens zio_vdev_io_assess(zio_t *zio) 2399789Sahrens { 2400789Sahrens vdev_t *vd = zio->io_vd; 2401789Sahrens 24027754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 24037754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 2404789Sahrens 24057754SJeff.Bonwick@Sun.COM if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 24067754SJeff.Bonwick@Sun.COM spa_config_exit(zio->io_spa, SCL_ZIO, zio); 24077754SJeff.Bonwick@Sun.COM 24087754SJeff.Bonwick@Sun.COM if (zio->io_vsd != NULL) { 240910614SJonathan.Adams@Sun.COM zio->io_vsd_ops->vsd_free(zio); 24107754SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 24111732Sbonwick } 24121732Sbonwick 24137754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 24141544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 2415789Sahrens 2416789Sahrens /* 2417789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 241811173SJonathan.Adams@Sun.COM * 241911173SJonathan.Adams@Sun.COM * On retry, we cut in line in the issue queue, since we don't want 242011173SJonathan.Adams@Sun.COM * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2421789Sahrens */ 24227754SJeff.Bonwick@Sun.COM if (zio->io_error && vd == NULL && 24237754SJeff.Bonwick@Sun.COM !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 24247754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 24257754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2426789Sahrens zio->io_error = 0; 24277754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_RETRY | 24287754SJeff.Bonwick@Sun.COM ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 242910922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 243011173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 243111173SJonathan.Adams@Sun.COM zio_requeue_io_start_cut_in_line); 24327754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 24337754SJeff.Bonwick@Sun.COM } 2434789Sahrens 24357754SJeff.Bonwick@Sun.COM /* 24367754SJeff.Bonwick@Sun.COM * If we got an error on a leaf device, convert it to ENXIO 24377754SJeff.Bonwick@Sun.COM * if the device is not accessible at all. 24387754SJeff.Bonwick@Sun.COM */ 24397754SJeff.Bonwick@Sun.COM if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 24407754SJeff.Bonwick@Sun.COM !vdev_accessible(vd, zio)) 24417754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 24427754SJeff.Bonwick@Sun.COM 24437754SJeff.Bonwick@Sun.COM /* 24447754SJeff.Bonwick@Sun.COM * If we can't write to an interior vdev (mirror or RAID-Z), 24457754SJeff.Bonwick@Sun.COM * set vdev_cant_write so that we stop trying to allocate from it. 24467754SJeff.Bonwick@Sun.COM */ 24477754SJeff.Bonwick@Sun.COM if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 24487754SJeff.Bonwick@Sun.COM vd != NULL && !vd->vdev_ops->vdev_op_leaf) 24497754SJeff.Bonwick@Sun.COM vd->vdev_cant_write = B_TRUE; 24507754SJeff.Bonwick@Sun.COM 24517754SJeff.Bonwick@Sun.COM if (zio->io_error) 24527754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2453789Sahrens 24545530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2455789Sahrens } 2456789Sahrens 2457789Sahrens void 2458789Sahrens zio_vdev_io_reissue(zio_t *zio) 2459789Sahrens { 2460789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2461789Sahrens ASSERT(zio->io_error == 0); 2462789Sahrens 246310922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2464789Sahrens } 2465789Sahrens 2466789Sahrens void 2467789Sahrens zio_vdev_io_redone(zio_t *zio) 2468789Sahrens { 2469789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2470789Sahrens 247110922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2472789Sahrens } 2473789Sahrens 2474789Sahrens void 2475789Sahrens zio_vdev_io_bypass(zio_t *zio) 2476789Sahrens { 2477789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2478789Sahrens ASSERT(zio->io_error == 0); 2479789Sahrens 2480789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 248110922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2482789Sahrens } 2483789Sahrens 2484789Sahrens /* 2485789Sahrens * ========================================================================== 2486789Sahrens * Generate and verify checksums 2487789Sahrens * ========================================================================== 2488789Sahrens */ 24895530Sbonwick static int 2490789Sahrens zio_checksum_generate(zio_t *zio) 2491789Sahrens { 2492789Sahrens blkptr_t *bp = zio->io_bp; 24937754SJeff.Bonwick@Sun.COM enum zio_checksum checksum; 2494789Sahrens 24957754SJeff.Bonwick@Sun.COM if (bp == NULL) { 24967754SJeff.Bonwick@Sun.COM /* 24977754SJeff.Bonwick@Sun.COM * This is zio_write_phys(). 24987754SJeff.Bonwick@Sun.COM * We're either generating a label checksum, or none at all. 24997754SJeff.Bonwick@Sun.COM */ 25007754SJeff.Bonwick@Sun.COM checksum = zio->io_prop.zp_checksum; 2501789Sahrens 25027754SJeff.Bonwick@Sun.COM if (checksum == ZIO_CHECKSUM_OFF) 25037754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2504789Sahrens 25057754SJeff.Bonwick@Sun.COM ASSERT(checksum == ZIO_CHECKSUM_LABEL); 25067754SJeff.Bonwick@Sun.COM } else { 25077754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 25087754SJeff.Bonwick@Sun.COM ASSERT(!IO_IS_ALLOCATING(zio)); 25097754SJeff.Bonwick@Sun.COM checksum = ZIO_CHECKSUM_GANG_HEADER; 25107754SJeff.Bonwick@Sun.COM } else { 25117754SJeff.Bonwick@Sun.COM checksum = BP_GET_CHECKSUM(bp); 25127754SJeff.Bonwick@Sun.COM } 25137754SJeff.Bonwick@Sun.COM } 2514789Sahrens 25157754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2516789Sahrens 25175530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2518789Sahrens } 2519789Sahrens 25205530Sbonwick static int 2521789Sahrens zio_checksum_verify(zio_t *zio) 2522789Sahrens { 252310614SJonathan.Adams@Sun.COM zio_bad_cksum_t info; 25247754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 25257754SJeff.Bonwick@Sun.COM int error; 25267754SJeff.Bonwick@Sun.COM 252710922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vd != NULL); 252810922SJeff.Bonwick@Sun.COM 25297754SJeff.Bonwick@Sun.COM if (bp == NULL) { 25307754SJeff.Bonwick@Sun.COM /* 25317754SJeff.Bonwick@Sun.COM * This is zio_read_phys(). 25327754SJeff.Bonwick@Sun.COM * We're either verifying a label checksum, or nothing at all. 25337754SJeff.Bonwick@Sun.COM */ 25347754SJeff.Bonwick@Sun.COM if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 25357754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 25367754SJeff.Bonwick@Sun.COM 25377754SJeff.Bonwick@Sun.COM ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 25387754SJeff.Bonwick@Sun.COM } 25397754SJeff.Bonwick@Sun.COM 254010614SJonathan.Adams@Sun.COM if ((error = zio_checksum_error(zio, &info)) != 0) { 25417754SJeff.Bonwick@Sun.COM zio->io_error = error; 25427754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 254310614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum(zio->io_spa, 254410614SJonathan.Adams@Sun.COM zio->io_vd, zio, zio->io_offset, 254510614SJonathan.Adams@Sun.COM zio->io_size, NULL, &info); 25467754SJeff.Bonwick@Sun.COM } 2547789Sahrens } 2548789Sahrens 25495530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2550789Sahrens } 2551789Sahrens 2552789Sahrens /* 2553789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 2554789Sahrens */ 2555789Sahrens void 2556789Sahrens zio_checksum_verified(zio_t *zio) 2557789Sahrens { 255810922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2559789Sahrens } 2560789Sahrens 2561789Sahrens /* 25627754SJeff.Bonwick@Sun.COM * ========================================================================== 25637754SJeff.Bonwick@Sun.COM * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 25647754SJeff.Bonwick@Sun.COM * An error of 0 indictes success. ENXIO indicates whole-device failure, 25657754SJeff.Bonwick@Sun.COM * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 25667754SJeff.Bonwick@Sun.COM * indicate errors that are specific to one I/O, and most likely permanent. 25677754SJeff.Bonwick@Sun.COM * Any other error is presumed to be worse because we weren't expecting it. 25687754SJeff.Bonwick@Sun.COM * ========================================================================== 2569789Sahrens */ 25707754SJeff.Bonwick@Sun.COM int 25717754SJeff.Bonwick@Sun.COM zio_worst_error(int e1, int e2) 2572789Sahrens { 25737754SJeff.Bonwick@Sun.COM static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 25747754SJeff.Bonwick@Sun.COM int r1, r2; 25751775Sbillm 25767754SJeff.Bonwick@Sun.COM for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 25777754SJeff.Bonwick@Sun.COM if (e1 == zio_error_rank[r1]) 25787754SJeff.Bonwick@Sun.COM break; 25797754SJeff.Bonwick@Sun.COM 25807754SJeff.Bonwick@Sun.COM for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 25817754SJeff.Bonwick@Sun.COM if (e2 == zio_error_rank[r2]) 25827754SJeff.Bonwick@Sun.COM break; 25837754SJeff.Bonwick@Sun.COM 25847754SJeff.Bonwick@Sun.COM return (r1 > r2 ? e1 : e2); 2585789Sahrens } 2586789Sahrens 2587789Sahrens /* 2588789Sahrens * ========================================================================== 25897754SJeff.Bonwick@Sun.COM * I/O completion 2590789Sahrens * ========================================================================== 2591789Sahrens */ 25927754SJeff.Bonwick@Sun.COM static int 25937754SJeff.Bonwick@Sun.COM zio_ready(zio_t *zio) 25947754SJeff.Bonwick@Sun.COM { 25957754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 25968632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 25977754SJeff.Bonwick@Sun.COM 259810922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 259910922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 26009443SBill.Moore@Sun.COM return (ZIO_PIPELINE_STOP); 26019443SBill.Moore@Sun.COM 26027754SJeff.Bonwick@Sun.COM if (zio->io_ready) { 26037754SJeff.Bonwick@Sun.COM ASSERT(IO_IS_ALLOCATING(zio)); 26047754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 26057754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 26067754SJeff.Bonwick@Sun.COM 26077754SJeff.Bonwick@Sun.COM zio->io_ready(zio); 26087754SJeff.Bonwick@Sun.COM } 26097754SJeff.Bonwick@Sun.COM 26107754SJeff.Bonwick@Sun.COM if (bp != NULL && bp != &zio->io_bp_copy) 26117754SJeff.Bonwick@Sun.COM zio->io_bp_copy = *bp; 26127754SJeff.Bonwick@Sun.COM 26137754SJeff.Bonwick@Sun.COM if (zio->io_error) 26147754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 26157754SJeff.Bonwick@Sun.COM 26168632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 26178632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = 1; 26188632SBill.Moore@Sun.COM pio = zio_walk_parents(zio); 26198632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 26208632SBill.Moore@Sun.COM 26218632SBill.Moore@Sun.COM /* 26228632SBill.Moore@Sun.COM * As we notify zio's parents, new parents could be added. 26238632SBill.Moore@Sun.COM * New parents go to the head of zio's io_parent_list, however, 26248632SBill.Moore@Sun.COM * so we will (correctly) not notify them. The remainder of zio's 26258632SBill.Moore@Sun.COM * io_parent_list, from 'pio_next' onward, cannot change because 26268632SBill.Moore@Sun.COM * all parents must wait for us to be done before they can be done. 26278632SBill.Moore@Sun.COM */ 26288632SBill.Moore@Sun.COM for (; pio != NULL; pio = pio_next) { 26298632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 26307754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_READY); 26318632SBill.Moore@Sun.COM } 26327754SJeff.Bonwick@Sun.COM 263310922SJeff.Bonwick@Sun.COM if (zio->io_flags & ZIO_FLAG_NODATA) { 263410922SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp)) { 263510922SJeff.Bonwick@Sun.COM zio->io_flags &= ~ZIO_FLAG_NODATA; 263610922SJeff.Bonwick@Sun.COM } else { 263710922SJeff.Bonwick@Sun.COM ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 263810922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 263910922SJeff.Bonwick@Sun.COM } 264010922SJeff.Bonwick@Sun.COM } 264110922SJeff.Bonwick@Sun.COM 264211026STim.Haley@Sun.COM if (zio_injection_enabled && 264311026STim.Haley@Sun.COM zio->io_spa->spa_syncing_txg == zio->io_txg) 264411026STim.Haley@Sun.COM zio_handle_ignored_writes(zio); 264511026STim.Haley@Sun.COM 26467754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 26477754SJeff.Bonwick@Sun.COM } 26487754SJeff.Bonwick@Sun.COM 26497754SJeff.Bonwick@Sun.COM static int 26507754SJeff.Bonwick@Sun.COM zio_done(zio_t *zio) 26517754SJeff.Bonwick@Sun.COM { 26527754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 26537754SJeff.Bonwick@Sun.COM zio_t *lio = zio->io_logical; 26547754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 26557754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 26567754SJeff.Bonwick@Sun.COM uint64_t psize = zio->io_size; 26578632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 26587754SJeff.Bonwick@Sun.COM 26597754SJeff.Bonwick@Sun.COM /* 26609443SBill.Moore@Sun.COM * If our children haven't all completed, 26617754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 26627754SJeff.Bonwick@Sun.COM */ 26637754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 26647754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 266510922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 26667754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 26677754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 26687754SJeff.Bonwick@Sun.COM 26697754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 26707754SJeff.Bonwick@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 26717754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[c][w] == 0); 26727754SJeff.Bonwick@Sun.COM 26737754SJeff.Bonwick@Sun.COM if (bp != NULL) { 26747754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[0] == 0); 26757754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[1] == 0); 26767754SJeff.Bonwick@Sun.COM ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 26778632SBill.Moore@Sun.COM (bp == zio_unique_parent(zio)->io_bp)); 26787754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 267910922SJeff.Bonwick@Sun.COM zio->io_bp_override == NULL && 26807754SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 26817754SJeff.Bonwick@Sun.COM ASSERT(!BP_SHOULD_BYTESWAP(bp)); 268210922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 26837754SJeff.Bonwick@Sun.COM ASSERT(BP_COUNT_GANG(bp) == 0 || 26847754SJeff.Bonwick@Sun.COM (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 26857754SJeff.Bonwick@Sun.COM } 26867754SJeff.Bonwick@Sun.COM } 26877754SJeff.Bonwick@Sun.COM 26887754SJeff.Bonwick@Sun.COM /* 268910922SJeff.Bonwick@Sun.COM * If there were child vdev/gang/ddt errors, they apply to us now. 26907754SJeff.Bonwick@Sun.COM */ 26917754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 26927754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 269310922SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 269410922SJeff.Bonwick@Sun.COM 269510922SJeff.Bonwick@Sun.COM /* 269610922SJeff.Bonwick@Sun.COM * If the I/O on the transformed data was successful, generate any 269710922SJeff.Bonwick@Sun.COM * checksum reports now while we still have the transformed data. 269810922SJeff.Bonwick@Sun.COM */ 269910922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 270010922SJeff.Bonwick@Sun.COM while (zio->io_cksum_report != NULL) { 270110922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 270210922SJeff.Bonwick@Sun.COM uint64_t align = zcr->zcr_align; 270310922SJeff.Bonwick@Sun.COM uint64_t asize = P2ROUNDUP(psize, align); 270410922SJeff.Bonwick@Sun.COM char *abuf = zio->io_data; 270510922SJeff.Bonwick@Sun.COM 270610922SJeff.Bonwick@Sun.COM if (asize != psize) { 270710922SJeff.Bonwick@Sun.COM abuf = zio_buf_alloc(asize); 270810922SJeff.Bonwick@Sun.COM bcopy(zio->io_data, abuf, psize); 270910922SJeff.Bonwick@Sun.COM bzero(abuf + psize, asize - psize); 271010922SJeff.Bonwick@Sun.COM } 271110922SJeff.Bonwick@Sun.COM 271210922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 271310922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 271410922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, abuf); 271510922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 271610922SJeff.Bonwick@Sun.COM 271710922SJeff.Bonwick@Sun.COM if (asize != psize) 271810922SJeff.Bonwick@Sun.COM zio_buf_free(abuf, asize); 271910922SJeff.Bonwick@Sun.COM } 272010922SJeff.Bonwick@Sun.COM } 27217754SJeff.Bonwick@Sun.COM 27227754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); /* note: may set zio->io_error */ 27237754SJeff.Bonwick@Sun.COM 27247754SJeff.Bonwick@Sun.COM vdev_stat_update(zio, psize); 27257754SJeff.Bonwick@Sun.COM 27267754SJeff.Bonwick@Sun.COM if (zio->io_error) { 27277754SJeff.Bonwick@Sun.COM /* 27287754SJeff.Bonwick@Sun.COM * If this I/O is attached to a particular vdev, 27297754SJeff.Bonwick@Sun.COM * generate an error message describing the I/O failure 27307754SJeff.Bonwick@Sun.COM * at the block level. We ignore these errors if the 27317754SJeff.Bonwick@Sun.COM * device is currently unavailable. 27327754SJeff.Bonwick@Sun.COM */ 27337754SJeff.Bonwick@Sun.COM if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 27347754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 27357754SJeff.Bonwick@Sun.COM 273610685SGeorge.Wilson@Sun.COM if ((zio->io_error == EIO || !(zio->io_flags & 273710685SGeorge.Wilson@Sun.COM (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 273810685SGeorge.Wilson@Sun.COM zio == lio) { 27397754SJeff.Bonwick@Sun.COM /* 27407754SJeff.Bonwick@Sun.COM * For logical I/O requests, tell the SPA to log the 27417754SJeff.Bonwick@Sun.COM * error and generate a logical data ereport. 27427754SJeff.Bonwick@Sun.COM */ 27437754SJeff.Bonwick@Sun.COM spa_log_error(spa, zio); 27447754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 27457754SJeff.Bonwick@Sun.COM 0, 0); 27467754SJeff.Bonwick@Sun.COM } 27477754SJeff.Bonwick@Sun.COM } 27487754SJeff.Bonwick@Sun.COM 27497754SJeff.Bonwick@Sun.COM if (zio->io_error && zio == lio) { 27507754SJeff.Bonwick@Sun.COM /* 27517754SJeff.Bonwick@Sun.COM * Determine whether zio should be reexecuted. This will 27527754SJeff.Bonwick@Sun.COM * propagate all the way to the root via zio_notify_parent(). 27537754SJeff.Bonwick@Sun.COM */ 27547754SJeff.Bonwick@Sun.COM ASSERT(vd == NULL && bp != NULL); 275510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 275610922SJeff.Bonwick@Sun.COM 275710922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(zio) && 275810922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 27597754SJeff.Bonwick@Sun.COM if (zio->io_error != ENOSPC) 27607754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_NOW; 27617754SJeff.Bonwick@Sun.COM else 27627754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 276310922SJeff.Bonwick@Sun.COM } 27647754SJeff.Bonwick@Sun.COM 27657754SJeff.Bonwick@Sun.COM if ((zio->io_type == ZIO_TYPE_READ || 27667754SJeff.Bonwick@Sun.COM zio->io_type == ZIO_TYPE_FREE) && 276712586SGeorge.Wilson@Sun.COM !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 27687754SJeff.Bonwick@Sun.COM zio->io_error == ENXIO && 276911147SGeorge.Wilson@Sun.COM spa_load_state(spa) == SPA_LOAD_NONE && 27707754SJeff.Bonwick@Sun.COM spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 27717754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 27727754SJeff.Bonwick@Sun.COM 27737754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 27747754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 277510614SJonathan.Adams@Sun.COM 277610614SJonathan.Adams@Sun.COM /* 277710614SJonathan.Adams@Sun.COM * Here is a possibly good place to attempt to do 277810614SJonathan.Adams@Sun.COM * either combinatorial reconstruction or error correction 277910614SJonathan.Adams@Sun.COM * based on checksums. It also might be a good place 278010614SJonathan.Adams@Sun.COM * to send out preliminary ereports before we suspend 278110614SJonathan.Adams@Sun.COM * processing. 278210614SJonathan.Adams@Sun.COM */ 27837754SJeff.Bonwick@Sun.COM } 27847754SJeff.Bonwick@Sun.COM 27857754SJeff.Bonwick@Sun.COM /* 27867754SJeff.Bonwick@Sun.COM * If there were logical child errors, they apply to us now. 27877754SJeff.Bonwick@Sun.COM * We defer this until now to avoid conflating logical child 27887754SJeff.Bonwick@Sun.COM * errors with errors that happened to the zio itself when 27897754SJeff.Bonwick@Sun.COM * updating vdev stats and reporting FMA events above. 27907754SJeff.Bonwick@Sun.COM */ 27917754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 27927754SJeff.Bonwick@Sun.COM 279310922SJeff.Bonwick@Sun.COM if ((zio->io_error || zio->io_reexecute) && 279410922SJeff.Bonwick@Sun.COM IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 279510922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 27969443SBill.Moore@Sun.COM zio_dva_unallocate(zio, zio->io_gang_tree, bp); 27979443SBill.Moore@Sun.COM 27989443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 27999443SBill.Moore@Sun.COM 28009470SGeorge.Wilson@Sun.COM /* 28019470SGeorge.Wilson@Sun.COM * Godfather I/Os should never suspend. 28029470SGeorge.Wilson@Sun.COM */ 28039470SGeorge.Wilson@Sun.COM if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 28049470SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 28059470SGeorge.Wilson@Sun.COM zio->io_reexecute = 0; 28069470SGeorge.Wilson@Sun.COM 28079470SGeorge.Wilson@Sun.COM if (zio->io_reexecute) { 28087754SJeff.Bonwick@Sun.COM /* 28097754SJeff.Bonwick@Sun.COM * This is a logical I/O that wants to reexecute. 28107754SJeff.Bonwick@Sun.COM * 28117754SJeff.Bonwick@Sun.COM * Reexecute is top-down. When an i/o fails, if it's not 28127754SJeff.Bonwick@Sun.COM * the root, it simply notifies its parent and sticks around. 28137754SJeff.Bonwick@Sun.COM * The parent, seeing that it still has children in zio_done(), 28147754SJeff.Bonwick@Sun.COM * does the same. This percolates all the way up to the root. 28157754SJeff.Bonwick@Sun.COM * The root i/o will reexecute or suspend the entire tree. 28167754SJeff.Bonwick@Sun.COM * 28177754SJeff.Bonwick@Sun.COM * This approach ensures that zio_reexecute() honors 28187754SJeff.Bonwick@Sun.COM * all the original i/o dependency relationships, e.g. 28197754SJeff.Bonwick@Sun.COM * parents not executing until children are ready. 28207754SJeff.Bonwick@Sun.COM */ 28217754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 28227754SJeff.Bonwick@Sun.COM 28239443SBill.Moore@Sun.COM zio->io_gang_leader = NULL; 28247754SJeff.Bonwick@Sun.COM 28258632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 28268632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 28278632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 28288632SBill.Moore@Sun.COM 28299234SGeorge.Wilson@Sun.COM /* 28309234SGeorge.Wilson@Sun.COM * "The Godfather" I/O monitors its children but is 28319234SGeorge.Wilson@Sun.COM * not a true parent to them. It will track them through 28329234SGeorge.Wilson@Sun.COM * the pipeline but severs its ties whenever they get into 28339234SGeorge.Wilson@Sun.COM * trouble (e.g. suspended). This allows "The Godfather" 28349234SGeorge.Wilson@Sun.COM * I/O to return status without blocking. 28359234SGeorge.Wilson@Sun.COM */ 28369234SGeorge.Wilson@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 28379234SGeorge.Wilson@Sun.COM zio_link_t *zl = zio->io_walk_link; 28389234SGeorge.Wilson@Sun.COM pio_next = zio_walk_parents(zio); 28399234SGeorge.Wilson@Sun.COM 28409234SGeorge.Wilson@Sun.COM if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 28419234SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 28429234SGeorge.Wilson@Sun.COM zio_remove_child(pio, zio, zl); 28439234SGeorge.Wilson@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28449234SGeorge.Wilson@Sun.COM } 28459234SGeorge.Wilson@Sun.COM } 28469234SGeorge.Wilson@Sun.COM 28478632SBill.Moore@Sun.COM if ((pio = zio_unique_parent(zio)) != NULL) { 28487754SJeff.Bonwick@Sun.COM /* 28497754SJeff.Bonwick@Sun.COM * We're not a root i/o, so there's nothing to do 28507754SJeff.Bonwick@Sun.COM * but notify our parent. Don't propagate errors 28517754SJeff.Bonwick@Sun.COM * upward since we haven't permanently failed yet. 28527754SJeff.Bonwick@Sun.COM */ 28539470SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 28547754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 28557754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28567754SJeff.Bonwick@Sun.COM } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 28577754SJeff.Bonwick@Sun.COM /* 28587754SJeff.Bonwick@Sun.COM * We'd fail again if we reexecuted now, so suspend 28597754SJeff.Bonwick@Sun.COM * until conditions improve (e.g. device comes online). 28607754SJeff.Bonwick@Sun.COM */ 28617754SJeff.Bonwick@Sun.COM zio_suspend(spa, zio); 28627754SJeff.Bonwick@Sun.COM } else { 28637754SJeff.Bonwick@Sun.COM /* 28647754SJeff.Bonwick@Sun.COM * Reexecution is potentially a huge amount of work. 28657754SJeff.Bonwick@Sun.COM * Hand it off to the otherwise-unused claim taskq. 28667754SJeff.Bonwick@Sun.COM */ 28677754SJeff.Bonwick@Sun.COM (void) taskq_dispatch( 28687754SJeff.Bonwick@Sun.COM spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 28697754SJeff.Bonwick@Sun.COM (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 28707754SJeff.Bonwick@Sun.COM } 28717754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28727754SJeff.Bonwick@Sun.COM } 28737754SJeff.Bonwick@Sun.COM 287410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 28759470SGeorge.Wilson@Sun.COM ASSERT(zio->io_reexecute == 0); 28767754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 28777754SJeff.Bonwick@Sun.COM 287810922SJeff.Bonwick@Sun.COM /* 287910922SJeff.Bonwick@Sun.COM * Report any checksum errors, since the I/O is complete. 288010922SJeff.Bonwick@Sun.COM */ 288110614SJonathan.Adams@Sun.COM while (zio->io_cksum_report != NULL) { 288210922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 288310922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 288410922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 288510922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, NULL); 288610922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 288710614SJonathan.Adams@Sun.COM } 288810614SJonathan.Adams@Sun.COM 28898632SBill.Moore@Sun.COM /* 28908632SBill.Moore@Sun.COM * It is the responsibility of the done callback to ensure that this 28918632SBill.Moore@Sun.COM * particular zio is no longer discoverable for adoption, and as 28928632SBill.Moore@Sun.COM * such, cannot acquire any new parents. 28938632SBill.Moore@Sun.COM */ 28947754SJeff.Bonwick@Sun.COM if (zio->io_done) 28957754SJeff.Bonwick@Sun.COM zio->io_done(zio); 28967754SJeff.Bonwick@Sun.COM 28978632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 28988632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 28998632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 29007754SJeff.Bonwick@Sun.COM 29018632SBill.Moore@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 29028632SBill.Moore@Sun.COM zio_link_t *zl = zio->io_walk_link; 29038632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 29048632SBill.Moore@Sun.COM zio_remove_child(pio, zio, zl); 29057754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 29067754SJeff.Bonwick@Sun.COM } 29077754SJeff.Bonwick@Sun.COM 29087754SJeff.Bonwick@Sun.COM if (zio->io_waiter != NULL) { 29097754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 29107754SJeff.Bonwick@Sun.COM zio->io_executor = NULL; 29117754SJeff.Bonwick@Sun.COM cv_broadcast(&zio->io_cv); 29127754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 29137754SJeff.Bonwick@Sun.COM } else { 29147754SJeff.Bonwick@Sun.COM zio_destroy(zio); 29157754SJeff.Bonwick@Sun.COM } 29167754SJeff.Bonwick@Sun.COM 29177754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 29187754SJeff.Bonwick@Sun.COM } 29197754SJeff.Bonwick@Sun.COM 29207754SJeff.Bonwick@Sun.COM /* 29217754SJeff.Bonwick@Sun.COM * ========================================================================== 29227754SJeff.Bonwick@Sun.COM * I/O pipeline definition 29237754SJeff.Bonwick@Sun.COM * ========================================================================== 29247754SJeff.Bonwick@Sun.COM */ 292510922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[] = { 29265530Sbonwick NULL, 292710922SJeff.Bonwick@Sun.COM zio_read_bp_init, 292810922SJeff.Bonwick@Sun.COM zio_free_bp_init, 29295530Sbonwick zio_issue_async, 29307754SJeff.Bonwick@Sun.COM zio_write_bp_init, 2931789Sahrens zio_checksum_generate, 293210922SJeff.Bonwick@Sun.COM zio_ddt_read_start, 293310922SJeff.Bonwick@Sun.COM zio_ddt_read_done, 293410922SJeff.Bonwick@Sun.COM zio_ddt_write, 293510922SJeff.Bonwick@Sun.COM zio_ddt_free, 29367754SJeff.Bonwick@Sun.COM zio_gang_assemble, 29377754SJeff.Bonwick@Sun.COM zio_gang_issue, 2938789Sahrens zio_dva_allocate, 2939789Sahrens zio_dva_free, 2940789Sahrens zio_dva_claim, 2941789Sahrens zio_ready, 2942789Sahrens zio_vdev_io_start, 2943789Sahrens zio_vdev_io_done, 2944789Sahrens zio_vdev_io_assess, 2945789Sahrens zio_checksum_verify, 29467754SJeff.Bonwick@Sun.COM zio_done 2947789Sahrens }; 2948