/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 *
 * One value per ZIO_PRIORITY_* constant; the comment on each entry names the
 * constant that entry is positioned for.  The initializers are positional,
 * so the order of the entries is significant.
 * NOTE(review): presumably indexed by the ZIO_PRIORITY_* value; the units
 * of the numbers are not visible in this file -- confirm with the consumers.
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	1,	/* ZIO_PRIORITY_CACHE_FILL */
	1,	/* ZIO_PRIORITY_AGG */
	4,	/* ZIO_PRIORITY_FREE */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
	2,	/* ZIO_PRIORITY_DDT_PREFETCH */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 *
 * Printable name for each I/O type (ZIO_TYPES entries).
 * NOTE(review): presumably indexed by zio_type_t -- confirm the enum order.
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};
67789Sahrens 68789Sahrens /* 69789Sahrens * ========================================================================== 70789Sahrens * I/O kmem caches 71789Sahrens * ========================================================================== 72789Sahrens */ 734055Seschrock kmem_cache_t *zio_cache; 748632SBill.Moore@Sun.COM kmem_cache_t *zio_link_cache; 75789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 763290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 773290Sjohansen 783290Sjohansen #ifdef _KERNEL 793290Sjohansen extern vmem_t *zio_alloc_arena; 803290Sjohansen #endif 81789Sahrens 825329Sgw25295 /* 837754SJeff.Bonwick@Sun.COM * An allocating zio is one that either currently has the DVA allocate 847754SJeff.Bonwick@Sun.COM * stage set or will have it later in its lifetime. 855329Sgw25295 */ 8610922SJeff.Bonwick@Sun.COM #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 8710922SJeff.Bonwick@Sun.COM 8811173SJonathan.Adams@Sun.COM boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 8911173SJonathan.Adams@Sun.COM 9010922SJeff.Bonwick@Sun.COM #ifdef ZFS_DEBUG 9110922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 16384; 9210922SJeff.Bonwick@Sun.COM #else 9310922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 0; 9410922SJeff.Bonwick@Sun.COM #endif 955329Sgw25295 96789Sahrens void 97789Sahrens zio_init(void) 98789Sahrens { 99789Sahrens size_t c; 1003290Sjohansen vmem_t *data_alloc_arena = NULL; 1013290Sjohansen 1023290Sjohansen #ifdef _KERNEL 1033290Sjohansen data_alloc_arena = zio_alloc_arena; 1043290Sjohansen #endif 1058632SBill.Moore@Sun.COM zio_cache = kmem_cache_create("zio_cache", 1068632SBill.Moore@Sun.COM sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1078632SBill.Moore@Sun.COM zio_link_cache = kmem_cache_create("zio_link_cache", 1088632SBill.Moore@Sun.COM sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1094055Seschrock 110789Sahrens /* 111789Sahrens * 
For small buffers, we want a cache for each multiple of 112789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 113789Sahrens * for each quarter-power of 2. For large buffers, we want 114789Sahrens * a cache for each multiple of PAGESIZE. 115789Sahrens */ 116789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 117789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 118789Sahrens size_t p2 = size; 119789Sahrens size_t align = 0; 120789Sahrens 121789Sahrens while (p2 & (p2 - 1)) 122789Sahrens p2 &= p2 - 1; 123789Sahrens 124789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 125789Sahrens align = SPA_MINBLOCKSIZE; 126789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 127789Sahrens align = PAGESIZE; 128789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 129789Sahrens align = p2 >> 2; 130789Sahrens } 131789Sahrens 132789Sahrens if (align != 0) { 1333290Sjohansen char name[36]; 1342856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 135789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 13610922SJeff.Bonwick@Sun.COM align, NULL, NULL, NULL, NULL, NULL, 13710922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 1383290Sjohansen 1393290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1403290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1413290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 14210922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? 
KMC_NODEBUG : 0); 143789Sahrens } 144789Sahrens } 145789Sahrens 146789Sahrens while (--c != 0) { 147789Sahrens ASSERT(zio_buf_cache[c] != NULL); 148789Sahrens if (zio_buf_cache[c - 1] == NULL) 149789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1503290Sjohansen 1513290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1523290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1533290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 154789Sahrens } 1551544Seschrock 1561544Seschrock zio_inject_init(); 157789Sahrens } 158789Sahrens 159789Sahrens void 160789Sahrens zio_fini(void) 161789Sahrens { 162789Sahrens size_t c; 163789Sahrens kmem_cache_t *last_cache = NULL; 1643290Sjohansen kmem_cache_t *last_data_cache = NULL; 165789Sahrens 166789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 167789Sahrens if (zio_buf_cache[c] != last_cache) { 168789Sahrens last_cache = zio_buf_cache[c]; 169789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 170789Sahrens } 171789Sahrens zio_buf_cache[c] = NULL; 1723290Sjohansen 1733290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1743290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1753290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1763290Sjohansen } 1773290Sjohansen zio_data_buf_cache[c] = NULL; 178789Sahrens } 1791544Seschrock 1808632SBill.Moore@Sun.COM kmem_cache_destroy(zio_link_cache); 1814055Seschrock kmem_cache_destroy(zio_cache); 1824055Seschrock 1831544Seschrock zio_inject_fini(); 184789Sahrens } 185789Sahrens 186789Sahrens /* 187789Sahrens * ========================================================================== 188789Sahrens * Allocate and free I/O buffers 189789Sahrens * ========================================================================== 190789Sahrens */ 1913290Sjohansen 1923290Sjohansen /* 1933290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1943290Sjohansen * crashdump if the kernel panics, so use it judiciously. 
Obviously, it's 1953290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1963290Sjohansen * excess / transient data in-core during a crashdump. 1973290Sjohansen */ 198789Sahrens void * 199789Sahrens zio_buf_alloc(size_t size) 200789Sahrens { 201789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 202789Sahrens 203789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 204789Sahrens 2056245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 206789Sahrens } 207789Sahrens 2083290Sjohansen /* 2093290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2103290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2113290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2123290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2133290Sjohansen */ 2143290Sjohansen void * 2153290Sjohansen zio_data_buf_alloc(size_t size) 2163290Sjohansen { 2173290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2183290Sjohansen 2193290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2203290Sjohansen 2216245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2223290Sjohansen } 2233290Sjohansen 224789Sahrens void 225789Sahrens zio_buf_free(void *buf, size_t size) 226789Sahrens { 227789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 228789Sahrens 229789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 230789Sahrens 231789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 232789Sahrens } 233789Sahrens 2343290Sjohansen void 2353290Sjohansen zio_data_buf_free(void *buf, size_t size) 2363290Sjohansen { 2373290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2383290Sjohansen 2393290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2403290Sjohansen 2413290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2423290Sjohansen } 2433463Sahrens 244789Sahrens /* 
245789Sahrens * ========================================================================== 246789Sahrens * Push and pop I/O transform buffers 247789Sahrens * ========================================================================== 248789Sahrens */ 249789Sahrens static void 2507754SJeff.Bonwick@Sun.COM zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 2517754SJeff.Bonwick@Sun.COM zio_transform_func_t *transform) 252789Sahrens { 253789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 254789Sahrens 2557754SJeff.Bonwick@Sun.COM zt->zt_orig_data = zio->io_data; 2567754SJeff.Bonwick@Sun.COM zt->zt_orig_size = zio->io_size; 257789Sahrens zt->zt_bufsize = bufsize; 2587754SJeff.Bonwick@Sun.COM zt->zt_transform = transform; 259789Sahrens 260789Sahrens zt->zt_next = zio->io_transform_stack; 261789Sahrens zio->io_transform_stack = zt; 262789Sahrens 263789Sahrens zio->io_data = data; 264789Sahrens zio->io_size = size; 265789Sahrens } 266789Sahrens 267789Sahrens static void 2687754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio_t *zio) 269789Sahrens { 2707754SJeff.Bonwick@Sun.COM zio_transform_t *zt; 271789Sahrens 2727754SJeff.Bonwick@Sun.COM while ((zt = zio->io_transform_stack) != NULL) { 2737754SJeff.Bonwick@Sun.COM if (zt->zt_transform != NULL) 2747754SJeff.Bonwick@Sun.COM zt->zt_transform(zio, 2757754SJeff.Bonwick@Sun.COM zt->zt_orig_data, zt->zt_orig_size); 276789Sahrens 27710922SJeff.Bonwick@Sun.COM if (zt->zt_bufsize != 0) 27810922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zt->zt_bufsize); 279789Sahrens 2807754SJeff.Bonwick@Sun.COM zio->io_data = zt->zt_orig_data; 2817754SJeff.Bonwick@Sun.COM zio->io_size = zt->zt_orig_size; 2827754SJeff.Bonwick@Sun.COM zio->io_transform_stack = zt->zt_next; 283789Sahrens 2847754SJeff.Bonwick@Sun.COM kmem_free(zt, sizeof (zio_transform_t)); 285789Sahrens } 286789Sahrens } 287789Sahrens 288789Sahrens /* 289789Sahrens * 
========================================================================== 2907754SJeff.Bonwick@Sun.COM * I/O transform callbacks for subblocks and decompression 2917754SJeff.Bonwick@Sun.COM * ========================================================================== 2927754SJeff.Bonwick@Sun.COM */ 2937754SJeff.Bonwick@Sun.COM static void 2947754SJeff.Bonwick@Sun.COM zio_subblock(zio_t *zio, void *data, uint64_t size) 2957754SJeff.Bonwick@Sun.COM { 2967754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size > size); 2977754SJeff.Bonwick@Sun.COM 2987754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ) 2997754SJeff.Bonwick@Sun.COM bcopy(zio->io_data, data, size); 3007754SJeff.Bonwick@Sun.COM } 3017754SJeff.Bonwick@Sun.COM 3027754SJeff.Bonwick@Sun.COM static void 3037754SJeff.Bonwick@Sun.COM zio_decompress(zio_t *zio, void *data, uint64_t size) 3047754SJeff.Bonwick@Sun.COM { 3057754SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && 3067754SJeff.Bonwick@Sun.COM zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 30710922SJeff.Bonwick@Sun.COM zio->io_data, data, zio->io_size, size) != 0) 3087754SJeff.Bonwick@Sun.COM zio->io_error = EIO; 3097754SJeff.Bonwick@Sun.COM } 3107754SJeff.Bonwick@Sun.COM 3117754SJeff.Bonwick@Sun.COM /* 3127754SJeff.Bonwick@Sun.COM * ========================================================================== 3137754SJeff.Bonwick@Sun.COM * I/O parent/child relationships and pipeline interlocks 3147754SJeff.Bonwick@Sun.COM * ========================================================================== 3157754SJeff.Bonwick@Sun.COM */ 3168632SBill.Moore@Sun.COM /* 3178632SBill.Moore@Sun.COM * NOTE - Callers to zio_walk_parents() and zio_walk_children must 3188632SBill.Moore@Sun.COM * continue calling these functions until they return NULL. 3198632SBill.Moore@Sun.COM * Otherwise, the next caller will pick up the list walk in 3208632SBill.Moore@Sun.COM * some indeterminate state. 
(Otherwise every caller would 3218632SBill.Moore@Sun.COM * have to pass in a cookie to keep the state represented by 3228632SBill.Moore@Sun.COM * io_walk_link, which gets annoying.) 3238632SBill.Moore@Sun.COM */ 3248632SBill.Moore@Sun.COM zio_t * 3258632SBill.Moore@Sun.COM zio_walk_parents(zio_t *cio) 3268632SBill.Moore@Sun.COM { 3278632SBill.Moore@Sun.COM zio_link_t *zl = cio->io_walk_link; 3288632SBill.Moore@Sun.COM list_t *pl = &cio->io_parent_list; 3297754SJeff.Bonwick@Sun.COM 3308632SBill.Moore@Sun.COM zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 3318632SBill.Moore@Sun.COM cio->io_walk_link = zl; 3328632SBill.Moore@Sun.COM 3338632SBill.Moore@Sun.COM if (zl == NULL) 3348632SBill.Moore@Sun.COM return (NULL); 3358632SBill.Moore@Sun.COM 3368632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 3378632SBill.Moore@Sun.COM return (zl->zl_parent); 3388632SBill.Moore@Sun.COM } 3398632SBill.Moore@Sun.COM 3408632SBill.Moore@Sun.COM zio_t * 3418632SBill.Moore@Sun.COM zio_walk_children(zio_t *pio) 3427754SJeff.Bonwick@Sun.COM { 3438632SBill.Moore@Sun.COM zio_link_t *zl = pio->io_walk_link; 3448632SBill.Moore@Sun.COM list_t *cl = &pio->io_child_list; 3458632SBill.Moore@Sun.COM 3468632SBill.Moore@Sun.COM zl = (zl == NULL) ? 
list_head(cl) : list_next(cl, zl); 3478632SBill.Moore@Sun.COM pio->io_walk_link = zl; 3488632SBill.Moore@Sun.COM 3498632SBill.Moore@Sun.COM if (zl == NULL) 3508632SBill.Moore@Sun.COM return (NULL); 3518632SBill.Moore@Sun.COM 3528632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 3538632SBill.Moore@Sun.COM return (zl->zl_child); 3548632SBill.Moore@Sun.COM } 3558632SBill.Moore@Sun.COM 3568632SBill.Moore@Sun.COM zio_t * 3578632SBill.Moore@Sun.COM zio_unique_parent(zio_t *cio) 3588632SBill.Moore@Sun.COM { 3598632SBill.Moore@Sun.COM zio_t *pio = zio_walk_parents(cio); 3608632SBill.Moore@Sun.COM 3618632SBill.Moore@Sun.COM VERIFY(zio_walk_parents(cio) == NULL); 3628632SBill.Moore@Sun.COM return (pio); 3638632SBill.Moore@Sun.COM } 3648632SBill.Moore@Sun.COM 3658632SBill.Moore@Sun.COM void 3668632SBill.Moore@Sun.COM zio_add_child(zio_t *pio, zio_t *cio) 3678632SBill.Moore@Sun.COM { 3688632SBill.Moore@Sun.COM zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 3698632SBill.Moore@Sun.COM 3708632SBill.Moore@Sun.COM /* 3718632SBill.Moore@Sun.COM * Logical I/Os can have logical, gang, or vdev children. 3728632SBill.Moore@Sun.COM * Gang I/Os can have gang or vdev children. 3738632SBill.Moore@Sun.COM * Vdev I/Os can only have vdev children. 3748632SBill.Moore@Sun.COM * The following ASSERT captures all of these constraints. 
3758632SBill.Moore@Sun.COM */ 3768632SBill.Moore@Sun.COM ASSERT(cio->io_child_type <= pio->io_child_type); 3778632SBill.Moore@Sun.COM 3788632SBill.Moore@Sun.COM zl->zl_parent = pio; 3798632SBill.Moore@Sun.COM zl->zl_child = cio; 3808632SBill.Moore@Sun.COM 3818632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 3827754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 3838632SBill.Moore@Sun.COM 3848632SBill.Moore@Sun.COM ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 3858632SBill.Moore@Sun.COM 3868632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3878632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 3888632SBill.Moore@Sun.COM 3898632SBill.Moore@Sun.COM list_insert_head(&pio->io_child_list, zl); 3908632SBill.Moore@Sun.COM list_insert_head(&cio->io_parent_list, zl); 3918632SBill.Moore@Sun.COM 39210922SJeff.Bonwick@Sun.COM pio->io_child_count++; 39310922SJeff.Bonwick@Sun.COM cio->io_parent_count++; 39410922SJeff.Bonwick@Sun.COM 3957754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 3968632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 3977754SJeff.Bonwick@Sun.COM } 3987754SJeff.Bonwick@Sun.COM 3997754SJeff.Bonwick@Sun.COM static void 4008632SBill.Moore@Sun.COM zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 4017754SJeff.Bonwick@Sun.COM { 4028632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 4038632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 4047754SJeff.Bonwick@Sun.COM 4058632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 4067754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4078632SBill.Moore@Sun.COM 4088632SBill.Moore@Sun.COM list_remove(&pio->io_child_list, zl); 4098632SBill.Moore@Sun.COM list_remove(&cio->io_parent_list, zl); 4108632SBill.Moore@Sun.COM 41110922SJeff.Bonwick@Sun.COM pio->io_child_count--; 41210922SJeff.Bonwick@Sun.COM cio->io_parent_count--; 41310922SJeff.Bonwick@Sun.COM 4147754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4158632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 
4168632SBill.Moore@Sun.COM 4178632SBill.Moore@Sun.COM kmem_cache_free(zio_link_cache, zl); 4187754SJeff.Bonwick@Sun.COM } 4197754SJeff.Bonwick@Sun.COM 4207754SJeff.Bonwick@Sun.COM static boolean_t 4217754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 4227754SJeff.Bonwick@Sun.COM { 4237754SJeff.Bonwick@Sun.COM uint64_t *countp = &zio->io_children[child][wait]; 4247754SJeff.Bonwick@Sun.COM boolean_t waiting = B_FALSE; 4257754SJeff.Bonwick@Sun.COM 4267754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 4277754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 4287754SJeff.Bonwick@Sun.COM if (*countp != 0) { 42910922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 4307754SJeff.Bonwick@Sun.COM zio->io_stall = countp; 4317754SJeff.Bonwick@Sun.COM waiting = B_TRUE; 4327754SJeff.Bonwick@Sun.COM } 4337754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 4347754SJeff.Bonwick@Sun.COM 4357754SJeff.Bonwick@Sun.COM return (waiting); 4367754SJeff.Bonwick@Sun.COM } 4377754SJeff.Bonwick@Sun.COM 4387754SJeff.Bonwick@Sun.COM static void 4397754SJeff.Bonwick@Sun.COM zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 4407754SJeff.Bonwick@Sun.COM { 4417754SJeff.Bonwick@Sun.COM uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 4427754SJeff.Bonwick@Sun.COM int *errorp = &pio->io_child_error[zio->io_child_type]; 4437754SJeff.Bonwick@Sun.COM 4447754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4457754SJeff.Bonwick@Sun.COM if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 4467754SJeff.Bonwick@Sun.COM *errorp = zio_worst_error(*errorp, zio->io_error); 4477754SJeff.Bonwick@Sun.COM pio->io_reexecute |= zio->io_reexecute; 4487754SJeff.Bonwick@Sun.COM ASSERT3U(*countp, >, 0); 4497754SJeff.Bonwick@Sun.COM if (--*countp == 0 && pio->io_stall == countp) { 4507754SJeff.Bonwick@Sun.COM pio->io_stall = NULL; 4517754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4527754SJeff.Bonwick@Sun.COM 
zio_execute(pio); 4537754SJeff.Bonwick@Sun.COM } else { 4547754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4557754SJeff.Bonwick@Sun.COM } 4567754SJeff.Bonwick@Sun.COM } 4577754SJeff.Bonwick@Sun.COM 4587754SJeff.Bonwick@Sun.COM static void 4597754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio_t *zio, enum zio_child c) 4607754SJeff.Bonwick@Sun.COM { 4617754SJeff.Bonwick@Sun.COM if (zio->io_child_error[c] != 0 && zio->io_error == 0) 4627754SJeff.Bonwick@Sun.COM zio->io_error = zio->io_child_error[c]; 4637754SJeff.Bonwick@Sun.COM } 4647754SJeff.Bonwick@Sun.COM 4657754SJeff.Bonwick@Sun.COM /* 4667754SJeff.Bonwick@Sun.COM * ========================================================================== 4677754SJeff.Bonwick@Sun.COM * Create the various types of I/O (read, write, free, etc) 468789Sahrens * ========================================================================== 469789Sahrens */ 470789Sahrens static zio_t * 47110922SJeff.Bonwick@Sun.COM zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 472789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 47310922SJeff.Bonwick@Sun.COM zio_type_t type, int priority, enum zio_flag flags, 47410922SJeff.Bonwick@Sun.COM vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 47510922SJeff.Bonwick@Sun.COM enum zio_stage stage, enum zio_stage pipeline) 476789Sahrens { 477789Sahrens zio_t *zio; 478789Sahrens 479789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 480789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 4817754SJeff.Bonwick@Sun.COM ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 482789Sahrens 4837754SJeff.Bonwick@Sun.COM ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 4847754SJeff.Bonwick@Sun.COM ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 4857754SJeff.Bonwick@Sun.COM ASSERT(vd || stage == ZIO_STAGE_OPEN); 4867046Sahrens 4874055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 4884055Seschrock bzero(zio, sizeof (zio_t)); 
4897754SJeff.Bonwick@Sun.COM 4907754SJeff.Bonwick@Sun.COM mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 4917754SJeff.Bonwick@Sun.COM cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 4927754SJeff.Bonwick@Sun.COM 4938632SBill.Moore@Sun.COM list_create(&zio->io_parent_list, sizeof (zio_link_t), 4948632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_parent_node)); 4958632SBill.Moore@Sun.COM list_create(&zio->io_child_list, sizeof (zio_link_t), 4968632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_child_node)); 4978632SBill.Moore@Sun.COM 4987754SJeff.Bonwick@Sun.COM if (vd != NULL) 4997754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_VDEV; 5007754SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_GANG_CHILD) 5017754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_GANG; 50210922SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_DDT_CHILD) 50310922SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_DDT; 5047754SJeff.Bonwick@Sun.COM else 5057754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_LOGICAL; 5067754SJeff.Bonwick@Sun.COM 507789Sahrens if (bp != NULL) { 50810922SJeff.Bonwick@Sun.COM zio->io_bp = (blkptr_t *)bp; 509789Sahrens zio->io_bp_copy = *bp; 510789Sahrens zio->io_bp_orig = *bp; 51110922SJeff.Bonwick@Sun.COM if (type != ZIO_TYPE_WRITE || 51210922SJeff.Bonwick@Sun.COM zio->io_child_type == ZIO_CHILD_DDT) 5137754SJeff.Bonwick@Sun.COM zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 5149443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) 5157754SJeff.Bonwick@Sun.COM zio->io_logical = zio; 5169443SBill.Moore@Sun.COM if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 5179443SBill.Moore@Sun.COM pipeline |= ZIO_GANG_STAGES; 518789Sahrens } 5197754SJeff.Bonwick@Sun.COM 5207754SJeff.Bonwick@Sun.COM zio->io_spa = spa; 5217754SJeff.Bonwick@Sun.COM zio->io_txg = txg; 522789Sahrens zio->io_done = done; 523789Sahrens zio->io_private = private; 524789Sahrens zio->io_type = type; 525789Sahrens zio->io_priority = priority; 
5267754SJeff.Bonwick@Sun.COM zio->io_vd = vd; 5277754SJeff.Bonwick@Sun.COM zio->io_offset = offset; 52810922SJeff.Bonwick@Sun.COM zio->io_orig_data = zio->io_data = data; 52910922SJeff.Bonwick@Sun.COM zio->io_orig_size = zio->io_size = size; 5307754SJeff.Bonwick@Sun.COM zio->io_orig_flags = zio->io_flags = flags; 5317754SJeff.Bonwick@Sun.COM zio->io_orig_stage = zio->io_stage = stage; 5327754SJeff.Bonwick@Sun.COM zio->io_orig_pipeline = zio->io_pipeline = pipeline; 5337754SJeff.Bonwick@Sun.COM 5348632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 5358632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 5368632SBill.Moore@Sun.COM 5377754SJeff.Bonwick@Sun.COM if (zb != NULL) 5387754SJeff.Bonwick@Sun.COM zio->io_bookmark = *zb; 539789Sahrens 5407754SJeff.Bonwick@Sun.COM if (pio != NULL) { 5417754SJeff.Bonwick@Sun.COM if (zio->io_logical == NULL) 5421544Seschrock zio->io_logical = pio->io_logical; 5439443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_GANG) 5449443SBill.Moore@Sun.COM zio->io_gang_leader = pio->io_gang_leader; 5457754SJeff.Bonwick@Sun.COM zio_add_child(pio, zio); 546789Sahrens } 547789Sahrens 548789Sahrens return (zio); 549789Sahrens } 550789Sahrens 5515329Sgw25295 static void 5527754SJeff.Bonwick@Sun.COM zio_destroy(zio_t *zio) 5535329Sgw25295 { 5548632SBill.Moore@Sun.COM list_destroy(&zio->io_parent_list); 5558632SBill.Moore@Sun.COM list_destroy(&zio->io_child_list); 5567754SJeff.Bonwick@Sun.COM mutex_destroy(&zio->io_lock); 5577754SJeff.Bonwick@Sun.COM cv_destroy(&zio->io_cv); 5587754SJeff.Bonwick@Sun.COM kmem_cache_free(zio_cache, zio); 5595329Sgw25295 } 5605329Sgw25295 561789Sahrens zio_t * 5628632SBill.Moore@Sun.COM zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 56310922SJeff.Bonwick@Sun.COM void *private, enum zio_flag flags) 564789Sahrens { 565789Sahrens zio_t *zio; 566789Sahrens 567789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 
5688632SBill.Moore@Sun.COM ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 5697754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 570789Sahrens 571789Sahrens return (zio); 572789Sahrens } 573789Sahrens 574789Sahrens zio_t * 57510922SJeff.Bonwick@Sun.COM zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 576789Sahrens { 5778632SBill.Moore@Sun.COM return (zio_null(NULL, spa, NULL, done, private, flags)); 578789Sahrens } 579789Sahrens 580789Sahrens zio_t * 5817754SJeff.Bonwick@Sun.COM zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 5827754SJeff.Bonwick@Sun.COM void *data, uint64_t size, zio_done_func_t *done, void *private, 58310922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 584789Sahrens { 585789Sahrens zio_t *zio; 586789Sahrens 58710922SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 5887046Sahrens data, size, done, private, 5897754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 59010922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
59110922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 592789Sahrens 593789Sahrens return (zio); 594789Sahrens } 595789Sahrens 596789Sahrens zio_t * 5977754SJeff.Bonwick@Sun.COM zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 59810922SJeff.Bonwick@Sun.COM void *data, uint64_t size, const zio_prop_t *zp, 5997754SJeff.Bonwick@Sun.COM zio_done_func_t *ready, zio_done_func_t *done, void *private, 60010922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 601789Sahrens { 602789Sahrens zio_t *zio; 603789Sahrens 6047754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 6057754SJeff.Bonwick@Sun.COM zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 6067754SJeff.Bonwick@Sun.COM zp->zp_compress >= ZIO_COMPRESS_OFF && 6077754SJeff.Bonwick@Sun.COM zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 6087754SJeff.Bonwick@Sun.COM zp->zp_type < DMU_OT_NUMTYPES && 6097754SJeff.Bonwick@Sun.COM zp->zp_level < 32 && 61010922SJeff.Bonwick@Sun.COM zp->zp_copies > 0 && 61110922SJeff.Bonwick@Sun.COM zp->zp_copies <= spa_max_replication(spa) && 61210922SJeff.Bonwick@Sun.COM zp->zp_dedup <= 1 && 61310922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify <= 1); 6145329Sgw25295 615789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6167754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 61710922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
61810922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 619789Sahrens 6203547Smaybee zio->io_ready = ready; 6217754SJeff.Bonwick@Sun.COM zio->io_prop = *zp; 622789Sahrens 623789Sahrens return (zio); 624789Sahrens } 625789Sahrens 626789Sahrens zio_t * 6277754SJeff.Bonwick@Sun.COM zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 6287754SJeff.Bonwick@Sun.COM uint64_t size, zio_done_func_t *done, void *private, int priority, 62910922SJeff.Bonwick@Sun.COM enum zio_flag flags, zbookmark_t *zb) 630789Sahrens { 631789Sahrens zio_t *zio; 632789Sahrens 6337181Sperrin zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6347754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 6357754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 636789Sahrens 637789Sahrens return (zio); 638789Sahrens } 639789Sahrens 64010922SJeff.Bonwick@Sun.COM void 64110922SJeff.Bonwick@Sun.COM zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 64210922SJeff.Bonwick@Sun.COM { 64310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 64410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 64510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 64610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 64710922SJeff.Bonwick@Sun.COM 64810922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies = copies; 64910922SJeff.Bonwick@Sun.COM zio->io_bp_override = bp; 65010922SJeff.Bonwick@Sun.COM } 65110922SJeff.Bonwick@Sun.COM 65210922SJeff.Bonwick@Sun.COM void 65310922SJeff.Bonwick@Sun.COM zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 65410922SJeff.Bonwick@Sun.COM { 655*12470SMatthew.Ahrens@Sun.COM bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 65610922SJeff.Bonwick@Sun.COM } 65710922SJeff.Bonwick@Sun.COM 658789Sahrens zio_t * 65910922SJeff.Bonwick@Sun.COM zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 
66010922SJeff.Bonwick@Sun.COM enum zio_flag flags) 661789Sahrens { 662789Sahrens zio_t *zio; 663789Sahrens 66412296SLin.Ling@Sun.COM dprintf_bp(bp, "freeing in txg %llu, pass %u", 66512296SLin.Ling@Sun.COM (longlong_t)txg, spa->spa_sync_pass); 66612296SLin.Ling@Sun.COM 667789Sahrens ASSERT(!BP_IS_HOLE(bp)); 66810922SJeff.Bonwick@Sun.COM ASSERT(spa_syncing_txg(spa) == txg); 66910922SJeff.Bonwick@Sun.COM ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 670789Sahrens 6717754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 67210922SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 6737754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 674789Sahrens 675789Sahrens return (zio); 676789Sahrens } 677789Sahrens 678789Sahrens zio_t * 67910922SJeff.Bonwick@Sun.COM zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 68010922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, enum zio_flag flags) 681789Sahrens { 682789Sahrens zio_t *zio; 683789Sahrens 684789Sahrens /* 685789Sahrens * A claim is an allocation of a specific block. Claims are needed 686789Sahrens * to support immediate writes in the intent log. The issue is that 687789Sahrens * immediate writes contain committed data, but in a txg that was 688789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 689789Sahrens * the intent log claims all blocks that contain immediate write data 690789Sahrens * so that the SPA knows they're in use. 691789Sahrens * 692789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 693789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 69410922SJeff.Bonwick@Sun.COM * If txg == 0 we just verify that the block is claimable. 
695789Sahrens */ 696789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 69710922SJeff.Bonwick@Sun.COM ASSERT(txg == spa_first_txg(spa) || txg == 0); 69810922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 699789Sahrens 7007754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 7017754SJeff.Bonwick@Sun.COM done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 7027754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 703789Sahrens 704789Sahrens return (zio); 705789Sahrens } 706789Sahrens 707789Sahrens zio_t * 708789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 70910922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 710789Sahrens { 711789Sahrens zio_t *zio; 712789Sahrens int c; 713789Sahrens 714789Sahrens if (vd->vdev_children == 0) { 715789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 7167754SJeff.Bonwick@Sun.COM ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 717789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 718789Sahrens 719789Sahrens zio->io_cmd = cmd; 720789Sahrens } else { 7218632SBill.Moore@Sun.COM zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 722789Sahrens 723789Sahrens for (c = 0; c < vd->vdev_children; c++) 724789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 725789Sahrens done, private, priority, flags)); 726789Sahrens } 727789Sahrens 728789Sahrens return (zio); 729789Sahrens } 730789Sahrens 731789Sahrens zio_t * 732789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 733789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 73410922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 735789Sahrens { 736789Sahrens zio_t *zio; 7375329Sgw25295 7387754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7397754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size 
<= VDEV_LABEL_START_SIZE || 7407754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7417754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 742789Sahrens 7437754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7447754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 745789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 746789Sahrens 7477754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 748789Sahrens 749789Sahrens return (zio); 750789Sahrens } 751789Sahrens 752789Sahrens zio_t * 753789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 754789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 75510922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 756789Sahrens { 757789Sahrens zio_t *zio; 758789Sahrens 7597754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7607754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 7617754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7627754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 7635329Sgw25295 7647754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7657754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 766789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 767789Sahrens 7687754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 769789Sahrens 77011670SNeil.Perrin@Sun.COM if (zio_checksum_table[checksum].ci_eck) { 771789Sahrens /* 77211670SNeil.Perrin@Sun.COM * zec checksums are necessarily destructive -- they modify 7737754SJeff.Bonwick@Sun.COM * the end of the write buffer to hold the verifier/checksum. 774789Sahrens * Therefore, we must make a local copy in case the data is 7757754SJeff.Bonwick@Sun.COM * being written to multiple places in parallel. 
776789Sahrens */ 7777754SJeff.Bonwick@Sun.COM void *wbuf = zio_buf_alloc(size); 778789Sahrens bcopy(data, wbuf, size); 7797754SJeff.Bonwick@Sun.COM zio_push_transform(zio, wbuf, size, size, NULL); 780789Sahrens } 781789Sahrens 782789Sahrens return (zio); 783789Sahrens } 784789Sahrens 785789Sahrens /* 7867754SJeff.Bonwick@Sun.COM * Create a child I/O to do some work for us. 787789Sahrens */ 788789Sahrens zio_t * 7897754SJeff.Bonwick@Sun.COM zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 79010922SJeff.Bonwick@Sun.COM void *data, uint64_t size, int type, int priority, enum zio_flag flags, 791789Sahrens zio_done_func_t *done, void *private) 792789Sahrens { 79310922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 7947754SJeff.Bonwick@Sun.COM zio_t *zio; 7957754SJeff.Bonwick@Sun.COM 7967754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_parent == 7977754SJeff.Bonwick@Sun.COM (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 798789Sahrens 799789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 800789Sahrens /* 801789Sahrens * If we have the bp, then the child should perform the 802789Sahrens * checksum and the parent need not. This pushes error 803789Sahrens * detection as close to the leaves as possible and 804789Sahrens * eliminates redundant checksums in the interior nodes. 
805789Sahrens */ 80610922SJeff.Bonwick@Sun.COM pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 80710922SJeff.Bonwick@Sun.COM pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 8087754SJeff.Bonwick@Sun.COM } 8097754SJeff.Bonwick@Sun.COM 8107754SJeff.Bonwick@Sun.COM if (vd->vdev_children == 0) 8117754SJeff.Bonwick@Sun.COM offset += VDEV_LABEL_START_SIZE; 8127754SJeff.Bonwick@Sun.COM 81310922SJeff.Bonwick@Sun.COM flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 81410922SJeff.Bonwick@Sun.COM 81510922SJeff.Bonwick@Sun.COM /* 81610922SJeff.Bonwick@Sun.COM * If we've decided to do a repair, the write is not speculative -- 81710922SJeff.Bonwick@Sun.COM * even if the original read was. 81810922SJeff.Bonwick@Sun.COM */ 81910922SJeff.Bonwick@Sun.COM if (flags & ZIO_FLAG_IO_REPAIR) 82010922SJeff.Bonwick@Sun.COM flags &= ~ZIO_FLAG_SPECULATIVE; 82110922SJeff.Bonwick@Sun.COM 8227754SJeff.Bonwick@Sun.COM zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 82310922SJeff.Bonwick@Sun.COM done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 82410922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 8257754SJeff.Bonwick@Sun.COM 8267754SJeff.Bonwick@Sun.COM return (zio); 8277754SJeff.Bonwick@Sun.COM } 8287754SJeff.Bonwick@Sun.COM 8297754SJeff.Bonwick@Sun.COM zio_t * 8307754SJeff.Bonwick@Sun.COM zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 83110922SJeff.Bonwick@Sun.COM int type, int priority, enum zio_flag flags, 83210922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private) 8337754SJeff.Bonwick@Sun.COM { 8347754SJeff.Bonwick@Sun.COM zio_t *zio; 8357754SJeff.Bonwick@Sun.COM 8367754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_ops->vdev_op_leaf); 8377754SJeff.Bonwick@Sun.COM 8387754SJeff.Bonwick@Sun.COM zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 8397754SJeff.Bonwick@Sun.COM data, size, done, private, type, priority, 8407754SJeff.Bonwick@Sun.COM flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 
8417754SJeff.Bonwick@Sun.COM vd, offset, NULL, 84210922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 8437754SJeff.Bonwick@Sun.COM 8447754SJeff.Bonwick@Sun.COM return (zio); 8457754SJeff.Bonwick@Sun.COM } 8467754SJeff.Bonwick@Sun.COM 8477754SJeff.Bonwick@Sun.COM void 8487754SJeff.Bonwick@Sun.COM zio_flush(zio_t *zio, vdev_t *vd) 8497754SJeff.Bonwick@Sun.COM { 8507754SJeff.Bonwick@Sun.COM zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 8517754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_PRIORITY_NOW, 8527754SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 8537754SJeff.Bonwick@Sun.COM } 8547754SJeff.Bonwick@Sun.COM 85511670SNeil.Perrin@Sun.COM void 85611670SNeil.Perrin@Sun.COM zio_shrink(zio_t *zio, uint64_t size) 85711670SNeil.Perrin@Sun.COM { 85811670SNeil.Perrin@Sun.COM ASSERT(zio->io_executor == NULL); 85911670SNeil.Perrin@Sun.COM ASSERT(zio->io_orig_size == zio->io_size); 86011670SNeil.Perrin@Sun.COM ASSERT(size <= zio->io_size); 86111670SNeil.Perrin@Sun.COM 86211670SNeil.Perrin@Sun.COM /* 86311670SNeil.Perrin@Sun.COM * We don't shrink for raidz because of problems with the 86411670SNeil.Perrin@Sun.COM * reconstruction when reading back less than the block size. 86511670SNeil.Perrin@Sun.COM * Note, BP_IS_RAIDZ() assumes no compression. 
86611670SNeil.Perrin@Sun.COM */ 86711670SNeil.Perrin@Sun.COM ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 86811670SNeil.Perrin@Sun.COM if (!BP_IS_RAIDZ(zio->io_bp)) 86911670SNeil.Perrin@Sun.COM zio->io_orig_size = zio->io_size = size; 87011670SNeil.Perrin@Sun.COM } 87111670SNeil.Perrin@Sun.COM 8727754SJeff.Bonwick@Sun.COM /* 8737754SJeff.Bonwick@Sun.COM * ========================================================================== 8747754SJeff.Bonwick@Sun.COM * Prepare to read and write logical blocks 8757754SJeff.Bonwick@Sun.COM * ========================================================================== 8767754SJeff.Bonwick@Sun.COM */ 8777754SJeff.Bonwick@Sun.COM 8787754SJeff.Bonwick@Sun.COM static int 8797754SJeff.Bonwick@Sun.COM zio_read_bp_init(zio_t *zio) 8807754SJeff.Bonwick@Sun.COM { 8817754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 8827754SJeff.Bonwick@Sun.COM 8838274SJeff.Bonwick@Sun.COM if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 8849443SBill.Moore@Sun.COM zio->io_child_type == ZIO_CHILD_LOGICAL && 8859443SBill.Moore@Sun.COM !(zio->io_flags & ZIO_FLAG_RAW)) { 88610922SJeff.Bonwick@Sun.COM uint64_t psize = BP_GET_PSIZE(bp); 88710922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(psize); 88810922SJeff.Bonwick@Sun.COM 88910922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 8907754SJeff.Bonwick@Sun.COM } 8917754SJeff.Bonwick@Sun.COM 8927754SJeff.Bonwick@Sun.COM if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 8937754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 8947754SJeff.Bonwick@Sun.COM 89511125SJeff.Bonwick@Sun.COM if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 89611125SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 89711125SJeff.Bonwick@Sun.COM 89810922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 89910922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 90010922SJeff.Bonwick@Sun.COM 
9017754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 9027754SJeff.Bonwick@Sun.COM } 9037754SJeff.Bonwick@Sun.COM 9047754SJeff.Bonwick@Sun.COM static int 9057754SJeff.Bonwick@Sun.COM zio_write_bp_init(zio_t *zio) 9067754SJeff.Bonwick@Sun.COM { 90710922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 9087754SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 90910922SJeff.Bonwick@Sun.COM enum zio_compress compress = zp->zp_compress; 9107754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 9117754SJeff.Bonwick@Sun.COM uint64_t lsize = zio->io_size; 91210922SJeff.Bonwick@Sun.COM uint64_t psize = lsize; 9137754SJeff.Bonwick@Sun.COM int pass = 1; 9147754SJeff.Bonwick@Sun.COM 9157754SJeff.Bonwick@Sun.COM /* 9167754SJeff.Bonwick@Sun.COM * If our children haven't all reached the ready stage, 9177754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 9187754SJeff.Bonwick@Sun.COM */ 9197754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 9207754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 9217754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 9227754SJeff.Bonwick@Sun.COM 9237754SJeff.Bonwick@Sun.COM if (!IO_IS_ALLOCATING(zio)) 9247754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 9257754SJeff.Bonwick@Sun.COM 92610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 92710922SJeff.Bonwick@Sun.COM 92810922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 92910922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth != zio->io_txg); 93010922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 93110922SJeff.Bonwick@Sun.COM 93210922SJeff.Bonwick@Sun.COM *bp = *zio->io_bp_override; 93310922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 93410922SJeff.Bonwick@Sun.COM 93510922SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(bp) || !zp->zp_dedup) 93610922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 93710922SJeff.Bonwick@Sun.COM 
93810922SJeff.Bonwick@Sun.COM ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 93910922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify); 94010922SJeff.Bonwick@Sun.COM 94110922SJeff.Bonwick@Sun.COM if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 94210922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, 1); 94310922SJeff.Bonwick@Sun.COM zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 94410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 94510922SJeff.Bonwick@Sun.COM } 94610922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 94710922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 94810922SJeff.Bonwick@Sun.COM } 9497754SJeff.Bonwick@Sun.COM 9507754SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg) { 9517754SJeff.Bonwick@Sun.COM /* 9527754SJeff.Bonwick@Sun.COM * We're rewriting an existing block, which means we're 9537754SJeff.Bonwick@Sun.COM * working on behalf of spa_sync(). For spa_sync() to 9547754SJeff.Bonwick@Sun.COM * converge, it must eventually be the case that we don't 9557754SJeff.Bonwick@Sun.COM * have to allocate new blocks. But compression changes 9567754SJeff.Bonwick@Sun.COM * the blocksize, which forces a reallocate, and makes 9577754SJeff.Bonwick@Sun.COM * convergence take longer. Therefore, after the first 9587754SJeff.Bonwick@Sun.COM * few passes, stop compressing to ensure convergence. 
9597754SJeff.Bonwick@Sun.COM */ 96010922SJeff.Bonwick@Sun.COM pass = spa_sync_pass(spa); 96110922SJeff.Bonwick@Sun.COM 96210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(spa)); 96310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 96410922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp)); 9657754SJeff.Bonwick@Sun.COM 9667754SJeff.Bonwick@Sun.COM if (pass > SYNC_PASS_DONT_COMPRESS) 9677754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 9687754SJeff.Bonwick@Sun.COM 9697754SJeff.Bonwick@Sun.COM /* Make sure someone doesn't change their mind on overwrites */ 97010922SJeff.Bonwick@Sun.COM ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 97110922SJeff.Bonwick@Sun.COM spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 9727754SJeff.Bonwick@Sun.COM } 9737754SJeff.Bonwick@Sun.COM 9747754SJeff.Bonwick@Sun.COM if (compress != ZIO_COMPRESS_OFF) { 97510922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(lsize); 97610922SJeff.Bonwick@Sun.COM psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 97710922SJeff.Bonwick@Sun.COM if (psize == 0 || psize == lsize) { 9787754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 97910922SJeff.Bonwick@Sun.COM zio_buf_free(cbuf, lsize); 98010922SJeff.Bonwick@Sun.COM } else { 98110922SJeff.Bonwick@Sun.COM ASSERT(psize < lsize); 98210922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, lsize, NULL); 9837754SJeff.Bonwick@Sun.COM } 984789Sahrens } 985789Sahrens 9867754SJeff.Bonwick@Sun.COM /* 9877754SJeff.Bonwick@Sun.COM * The final pass of spa_sync() must be all rewrites, but the first 9887754SJeff.Bonwick@Sun.COM * few passes offer a trade-off: allocating blocks defers convergence, 9897754SJeff.Bonwick@Sun.COM * but newly allocated blocks are sequential, so they can be written 9907754SJeff.Bonwick@Sun.COM * to disk faster. Therefore, we allow the first few passes of 9917754SJeff.Bonwick@Sun.COM * spa_sync() to allocate new blocks, but force rewrites after that. 
9927754SJeff.Bonwick@Sun.COM * There should only be a handful of blocks after pass 1 in any case. 9937754SJeff.Bonwick@Sun.COM */ 99410922SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 9957754SJeff.Bonwick@Sun.COM pass > SYNC_PASS_REWRITE) { 99610922SJeff.Bonwick@Sun.COM ASSERT(psize != 0); 99710922SJeff.Bonwick@Sun.COM enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 9987754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 9997754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_REWRITE; 10007754SJeff.Bonwick@Sun.COM } else { 10017754SJeff.Bonwick@Sun.COM BP_ZERO(bp); 10027754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 10037754SJeff.Bonwick@Sun.COM } 10047754SJeff.Bonwick@Sun.COM 100510922SJeff.Bonwick@Sun.COM if (psize == 0) { 10067754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 10077754SJeff.Bonwick@Sun.COM } else { 10087754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 10097754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(bp, lsize); 101010922SJeff.Bonwick@Sun.COM BP_SET_PSIZE(bp, psize); 10117754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(bp, compress); 10127754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(bp, zp->zp_checksum); 10137754SJeff.Bonwick@Sun.COM BP_SET_TYPE(bp, zp->zp_type); 10147754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(bp, zp->zp_level); 101510922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, zp->zp_dedup); 10167754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 101710922SJeff.Bonwick@Sun.COM if (zp->zp_dedup) { 101810922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 101910922SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 102010922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 102110922SJeff.Bonwick@Sun.COM } 102210922SJeff.Bonwick@Sun.COM } 102310922SJeff.Bonwick@Sun.COM 102410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 
102510922SJeff.Bonwick@Sun.COM } 102610922SJeff.Bonwick@Sun.COM 102710922SJeff.Bonwick@Sun.COM static int 102810922SJeff.Bonwick@Sun.COM zio_free_bp_init(zio_t *zio) 102910922SJeff.Bonwick@Sun.COM { 103010922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 103110922SJeff.Bonwick@Sun.COM 103210922SJeff.Bonwick@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 103310922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp)) 103410922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 10357754SJeff.Bonwick@Sun.COM } 10367754SJeff.Bonwick@Sun.COM 10377754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 10387754SJeff.Bonwick@Sun.COM } 10397754SJeff.Bonwick@Sun.COM 10407754SJeff.Bonwick@Sun.COM /* 10417754SJeff.Bonwick@Sun.COM * ========================================================================== 10427754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline 10437754SJeff.Bonwick@Sun.COM * ========================================================================== 10447754SJeff.Bonwick@Sun.COM */ 10457754SJeff.Bonwick@Sun.COM 10467754SJeff.Bonwick@Sun.COM static void 104711173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) 10487754SJeff.Bonwick@Sun.COM { 104911146SGeorge.Wilson@Sun.COM spa_t *spa = zio->io_spa; 10507754SJeff.Bonwick@Sun.COM zio_type_t t = zio->io_type; 105111173SJonathan.Adams@Sun.COM int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); 10527754SJeff.Bonwick@Sun.COM 10537754SJeff.Bonwick@Sun.COM /* 10549722SGeorge.Wilson@Sun.COM * If we're a config writer or a probe, the normal issue and 10559722SGeorge.Wilson@Sun.COM * interrupt threads may all be blocked waiting for the config lock. 10569722SGeorge.Wilson@Sun.COM * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 
10577754SJeff.Bonwick@Sun.COM */ 10589722SGeorge.Wilson@Sun.COM if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 10597754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10607754SJeff.Bonwick@Sun.COM 10617754SJeff.Bonwick@Sun.COM /* 10627754SJeff.Bonwick@Sun.COM * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 10637754SJeff.Bonwick@Sun.COM */ 10647754SJeff.Bonwick@Sun.COM if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 10657754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10667754SJeff.Bonwick@Sun.COM 106711146SGeorge.Wilson@Sun.COM /* 106811146SGeorge.Wilson@Sun.COM * If this is a high priority I/O, then use the high priority taskq. 106911146SGeorge.Wilson@Sun.COM */ 107011146SGeorge.Wilson@Sun.COM if (zio->io_priority == ZIO_PRIORITY_NOW && 107111146SGeorge.Wilson@Sun.COM spa->spa_zio_taskq[t][q + 1] != NULL) 107211146SGeorge.Wilson@Sun.COM q++; 107311146SGeorge.Wilson@Sun.COM 107411146SGeorge.Wilson@Sun.COM ASSERT3U(q, <, ZIO_TASKQ_TYPES); 107511146SGeorge.Wilson@Sun.COM (void) taskq_dispatch(spa->spa_zio_taskq[t][q], 107611173SJonathan.Adams@Sun.COM (task_func_t *)zio_execute, zio, flags); 10777754SJeff.Bonwick@Sun.COM } 10787754SJeff.Bonwick@Sun.COM 10797754SJeff.Bonwick@Sun.COM static boolean_t 10807754SJeff.Bonwick@Sun.COM zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 10817754SJeff.Bonwick@Sun.COM { 10827754SJeff.Bonwick@Sun.COM kthread_t *executor = zio->io_executor; 10837754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 1084789Sahrens 10857754SJeff.Bonwick@Sun.COM for (zio_type_t t = 0; t < ZIO_TYPES; t++) 10867754SJeff.Bonwick@Sun.COM if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 10877754SJeff.Bonwick@Sun.COM return (B_TRUE); 10887754SJeff.Bonwick@Sun.COM 10897754SJeff.Bonwick@Sun.COM return (B_FALSE); 10907754SJeff.Bonwick@Sun.COM } 10917754SJeff.Bonwick@Sun.COM 10927754SJeff.Bonwick@Sun.COM static int 10937754SJeff.Bonwick@Sun.COM zio_issue_async(zio_t *zio) 10947754SJeff.Bonwick@Sun.COM { 
109511173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 10967754SJeff.Bonwick@Sun.COM 10977754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 10987754SJeff.Bonwick@Sun.COM } 10997754SJeff.Bonwick@Sun.COM 11007754SJeff.Bonwick@Sun.COM void 11017754SJeff.Bonwick@Sun.COM zio_interrupt(zio_t *zio) 11027754SJeff.Bonwick@Sun.COM { 110311173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 11047754SJeff.Bonwick@Sun.COM } 11057754SJeff.Bonwick@Sun.COM 11067754SJeff.Bonwick@Sun.COM /* 11077754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline until one of the following occurs: 11087754SJeff.Bonwick@Sun.COM * (1) the I/O completes; (2) the pipeline stalls waiting for 11097754SJeff.Bonwick@Sun.COM * dependent child I/Os; (3) the I/O issues, so we're waiting 11107754SJeff.Bonwick@Sun.COM * for an I/O completion interrupt; (4) the I/O is delegated by 11117754SJeff.Bonwick@Sun.COM * vdev-level caching or aggregation; (5) the I/O is deferred 11127754SJeff.Bonwick@Sun.COM * due to vdev-level queueing; (6) the I/O is handed off to 11137754SJeff.Bonwick@Sun.COM * another thread. In all cases, the pipeline stops whenever 11147754SJeff.Bonwick@Sun.COM * there's no CPU work; it never burns a thread in cv_wait(). 11157754SJeff.Bonwick@Sun.COM * 11167754SJeff.Bonwick@Sun.COM * There's no locking on io_stage because there's no legitimate way 11177754SJeff.Bonwick@Sun.COM * for multiple threads to be attempting to process the same I/O. 
11187754SJeff.Bonwick@Sun.COM */ 111910922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[]; 1120789Sahrens 11217754SJeff.Bonwick@Sun.COM void 11227754SJeff.Bonwick@Sun.COM zio_execute(zio_t *zio) 11237754SJeff.Bonwick@Sun.COM { 11247754SJeff.Bonwick@Sun.COM zio->io_executor = curthread; 11257754SJeff.Bonwick@Sun.COM 11267754SJeff.Bonwick@Sun.COM while (zio->io_stage < ZIO_STAGE_DONE) { 112710922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = zio->io_pipeline; 112810922SJeff.Bonwick@Sun.COM enum zio_stage stage = zio->io_stage; 11297754SJeff.Bonwick@Sun.COM int rv; 11307754SJeff.Bonwick@Sun.COM 11317754SJeff.Bonwick@Sun.COM ASSERT(!MUTEX_HELD(&zio->io_lock)); 113210922SJeff.Bonwick@Sun.COM ASSERT(ISP2(stage)); 113310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 113410922SJeff.Bonwick@Sun.COM 113510922SJeff.Bonwick@Sun.COM do { 113610922SJeff.Bonwick@Sun.COM stage <<= 1; 113710922SJeff.Bonwick@Sun.COM } while ((stage & pipeline) == 0); 11387754SJeff.Bonwick@Sun.COM 11397754SJeff.Bonwick@Sun.COM ASSERT(stage <= ZIO_STAGE_DONE); 11407754SJeff.Bonwick@Sun.COM 11417754SJeff.Bonwick@Sun.COM /* 11427754SJeff.Bonwick@Sun.COM * If we are in interrupt context and this pipeline stage 11437754SJeff.Bonwick@Sun.COM * will grab a config lock that is held across I/O, 114410922SJeff.Bonwick@Sun.COM * or may wait for an I/O that needs an interrupt thread 114510922SJeff.Bonwick@Sun.COM * to complete, issue async to avoid deadlock. 114611173SJonathan.Adams@Sun.COM * 114711173SJonathan.Adams@Sun.COM * For VDEV_IO_START, we cut in line so that the io will 114811173SJonathan.Adams@Sun.COM * be sent to disk promptly. 11497754SJeff.Bonwick@Sun.COM */ 115010922SJeff.Bonwick@Sun.COM if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 11517754SJeff.Bonwick@Sun.COM zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 115211173SJonathan.Adams@Sun.COM boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
115311173SJonathan.Adams@Sun.COM zio_requeue_io_start_cut_in_line : B_FALSE; 115411173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 11557754SJeff.Bonwick@Sun.COM return; 11567754SJeff.Bonwick@Sun.COM } 11577754SJeff.Bonwick@Sun.COM 11587754SJeff.Bonwick@Sun.COM zio->io_stage = stage; 115910922SJeff.Bonwick@Sun.COM rv = zio_pipeline[highbit(stage) - 1](zio); 11607754SJeff.Bonwick@Sun.COM 11617754SJeff.Bonwick@Sun.COM if (rv == ZIO_PIPELINE_STOP) 11627754SJeff.Bonwick@Sun.COM return; 11637754SJeff.Bonwick@Sun.COM 11647754SJeff.Bonwick@Sun.COM ASSERT(rv == ZIO_PIPELINE_CONTINUE); 11657754SJeff.Bonwick@Sun.COM } 1166789Sahrens } 1167789Sahrens 1168789Sahrens /* 1169789Sahrens * ========================================================================== 1170789Sahrens * Initiate I/O, either sync or async 1171789Sahrens * ========================================================================== 1172789Sahrens */ 1173789Sahrens int 1174789Sahrens zio_wait(zio_t *zio) 1175789Sahrens { 1176789Sahrens int error; 1177789Sahrens 1178789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 11797754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 1180789Sahrens 1181789Sahrens zio->io_waiter = curthread; 1182789Sahrens 11835530Sbonwick zio_execute(zio); 1184789Sahrens 1185789Sahrens mutex_enter(&zio->io_lock); 11867754SJeff.Bonwick@Sun.COM while (zio->io_executor != NULL) 1187789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 1188789Sahrens mutex_exit(&zio->io_lock); 1189789Sahrens 1190789Sahrens error = zio->io_error; 11916523Sek110237 zio_destroy(zio); 1192789Sahrens 1193789Sahrens return (error); 1194789Sahrens } 1195789Sahrens 1196789Sahrens void 1197789Sahrens zio_nowait(zio_t *zio) 1198789Sahrens { 11997754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 12007754SJeff.Bonwick@Sun.COM 12018632SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL && 12028632SBill.Moore@Sun.COM zio_unique_parent(zio) == NULL) { 
12037754SJeff.Bonwick@Sun.COM /* 12047754SJeff.Bonwick@Sun.COM * This is a logical async I/O with no parent to wait for it. 12059234SGeorge.Wilson@Sun.COM * We add it to the spa_async_root_zio "Godfather" I/O which 12069234SGeorge.Wilson@Sun.COM * will ensure they complete prior to unloading the pool. 12077754SJeff.Bonwick@Sun.COM */ 12087754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 12099234SGeorge.Wilson@Sun.COM 12109234SGeorge.Wilson@Sun.COM zio_add_child(spa->spa_async_zio_root, zio); 12117754SJeff.Bonwick@Sun.COM } 12127754SJeff.Bonwick@Sun.COM 12135530Sbonwick zio_execute(zio); 12145530Sbonwick } 12155530Sbonwick 12167754SJeff.Bonwick@Sun.COM /* 12177754SJeff.Bonwick@Sun.COM * ========================================================================== 12187754SJeff.Bonwick@Sun.COM * Reexecute or suspend/resume failed I/O 12197754SJeff.Bonwick@Sun.COM * ========================================================================== 12207754SJeff.Bonwick@Sun.COM */ 12217754SJeff.Bonwick@Sun.COM 12227754SJeff.Bonwick@Sun.COM static void 12237754SJeff.Bonwick@Sun.COM zio_reexecute(zio_t *pio) 12247754SJeff.Bonwick@Sun.COM { 12258632SBill.Moore@Sun.COM zio_t *cio, *cio_next; 12268632SBill.Moore@Sun.COM 12278632SBill.Moore@Sun.COM ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 12288632SBill.Moore@Sun.COM ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 12299443SBill.Moore@Sun.COM ASSERT(pio->io_gang_leader == NULL); 12309443SBill.Moore@Sun.COM ASSERT(pio->io_gang_tree == NULL); 12317754SJeff.Bonwick@Sun.COM 12327754SJeff.Bonwick@Sun.COM pio->io_flags = pio->io_orig_flags; 12337754SJeff.Bonwick@Sun.COM pio->io_stage = pio->io_orig_stage; 12347754SJeff.Bonwick@Sun.COM pio->io_pipeline = pio->io_orig_pipeline; 12357754SJeff.Bonwick@Sun.COM pio->io_reexecute = 0; 12367754SJeff.Bonwick@Sun.COM pio->io_error = 0; 12378632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12388632SBill.Moore@Sun.COM pio->io_state[w] = 0; 12397754SJeff.Bonwick@Sun.COM for (int c = 
0; c < ZIO_CHILD_TYPES; c++) 12407754SJeff.Bonwick@Sun.COM pio->io_child_error[c] = 0; 12417754SJeff.Bonwick@Sun.COM 124210922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(pio)) 124310922SJeff.Bonwick@Sun.COM BP_ZERO(pio->io_bp); 12447754SJeff.Bonwick@Sun.COM 12457754SJeff.Bonwick@Sun.COM /* 12467754SJeff.Bonwick@Sun.COM * As we reexecute pio's children, new children could be created. 12478632SBill.Moore@Sun.COM * New children go to the head of pio's io_child_list, however, 12487754SJeff.Bonwick@Sun.COM * so we will (correctly) not reexecute them. The key is that 12498632SBill.Moore@Sun.COM * the remainder of pio's io_child_list, from 'cio_next' onward, 12508632SBill.Moore@Sun.COM * cannot be affected by any side effects of reexecuting 'cio'. 12517754SJeff.Bonwick@Sun.COM */ 12528632SBill.Moore@Sun.COM for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 12538632SBill.Moore@Sun.COM cio_next = zio_walk_children(pio); 12547754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 12558632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12568632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w]++; 12577754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 12588632SBill.Moore@Sun.COM zio_reexecute(cio); 12597754SJeff.Bonwick@Sun.COM } 12607754SJeff.Bonwick@Sun.COM 12617754SJeff.Bonwick@Sun.COM /* 12627754SJeff.Bonwick@Sun.COM * Now that all children have been reexecuted, execute the parent. 12639234SGeorge.Wilson@Sun.COM * We don't reexecute "The Godfather" I/O here as it's the 12649234SGeorge.Wilson@Sun.COM * responsibility of the caller to wait on him. 
12657754SJeff.Bonwick@Sun.COM */ 12669234SGeorge.Wilson@Sun.COM if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 12679234SGeorge.Wilson@Sun.COM zio_execute(pio); 12687754SJeff.Bonwick@Sun.COM } 12697754SJeff.Bonwick@Sun.COM 12705530Sbonwick void 12717754SJeff.Bonwick@Sun.COM zio_suspend(spa_t *spa, zio_t *zio) 12725530Sbonwick { 12737754SJeff.Bonwick@Sun.COM if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 12747754SJeff.Bonwick@Sun.COM fm_panic("Pool '%s' has encountered an uncorrectable I/O " 12757754SJeff.Bonwick@Sun.COM "failure and the failure mode property for this pool " 12767754SJeff.Bonwick@Sun.COM "is set to panic.", spa_name(spa)); 12777754SJeff.Bonwick@Sun.COM 12787754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 12797754SJeff.Bonwick@Sun.COM 12807754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12817754SJeff.Bonwick@Sun.COM 12827754SJeff.Bonwick@Sun.COM if (spa->spa_suspend_zio_root == NULL) 12839234SGeorge.Wilson@Sun.COM spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 12849234SGeorge.Wilson@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 12859234SGeorge.Wilson@Sun.COM ZIO_FLAG_GODFATHER); 12867754SJeff.Bonwick@Sun.COM 12877754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_TRUE; 12887754SJeff.Bonwick@Sun.COM 12897754SJeff.Bonwick@Sun.COM if (zio != NULL) { 12909234SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 12917754SJeff.Bonwick@Sun.COM ASSERT(zio != spa->spa_suspend_zio_root); 12927754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 12938632SBill.Moore@Sun.COM ASSERT(zio_unique_parent(zio) == NULL); 12947754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_DONE); 12957754SJeff.Bonwick@Sun.COM zio_add_child(spa->spa_suspend_zio_root, zio); 12967754SJeff.Bonwick@Sun.COM } 12977754SJeff.Bonwick@Sun.COM 12987754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12995530Sbonwick } 13005530Sbonwick 13019234SGeorge.Wilson@Sun.COM int 
13027754SJeff.Bonwick@Sun.COM zio_resume(spa_t *spa) 13035530Sbonwick { 13049234SGeorge.Wilson@Sun.COM zio_t *pio; 13057754SJeff.Bonwick@Sun.COM 13067754SJeff.Bonwick@Sun.COM /* 13077754SJeff.Bonwick@Sun.COM * Reexecute all previously suspended i/o. 13087754SJeff.Bonwick@Sun.COM */ 13097754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 13107754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_FALSE; 13117754SJeff.Bonwick@Sun.COM cv_broadcast(&spa->spa_suspend_cv); 13127754SJeff.Bonwick@Sun.COM pio = spa->spa_suspend_zio_root; 13137754SJeff.Bonwick@Sun.COM spa->spa_suspend_zio_root = NULL; 13147754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 13157754SJeff.Bonwick@Sun.COM 13167754SJeff.Bonwick@Sun.COM if (pio == NULL) 13179234SGeorge.Wilson@Sun.COM return (0); 13185530Sbonwick 13199234SGeorge.Wilson@Sun.COM zio_reexecute(pio); 13209234SGeorge.Wilson@Sun.COM return (zio_wait(pio)); 13217754SJeff.Bonwick@Sun.COM } 13227754SJeff.Bonwick@Sun.COM 13237754SJeff.Bonwick@Sun.COM void 13247754SJeff.Bonwick@Sun.COM zio_resume_wait(spa_t *spa) 13257754SJeff.Bonwick@Sun.COM { 13267754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 13277754SJeff.Bonwick@Sun.COM while (spa_suspended(spa)) 13287754SJeff.Bonwick@Sun.COM cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 13297754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 1330789Sahrens } 1331789Sahrens 1332789Sahrens /* 1333789Sahrens * ========================================================================== 13347754SJeff.Bonwick@Sun.COM * Gang blocks. 13357754SJeff.Bonwick@Sun.COM * 13367754SJeff.Bonwick@Sun.COM * A gang block is a collection of small blocks that looks to the DMU 13377754SJeff.Bonwick@Sun.COM * like one large block. 
 * When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
 * an indirect block: it's an array of block pointers. It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header. This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which the root and all interior nodes
 * are gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it. This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free_sync() -- for
 * each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?
 * Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write. Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated. This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact). If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
1395789Sahrens * ========================================================================== 1396789Sahrens */ 13975530Sbonwick 13987754SJeff.Bonwick@Sun.COM static zio_t * 13997754SJeff.Bonwick@Sun.COM zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14007754SJeff.Bonwick@Sun.COM { 14017754SJeff.Bonwick@Sun.COM if (gn != NULL) 14027754SJeff.Bonwick@Sun.COM return (pio); 14035530Sbonwick 14047754SJeff.Bonwick@Sun.COM return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 14057754SJeff.Bonwick@Sun.COM NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 14067754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 1407789Sahrens } 1408789Sahrens 14097754SJeff.Bonwick@Sun.COM zio_t * 14107754SJeff.Bonwick@Sun.COM zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14116523Sek110237 { 14127754SJeff.Bonwick@Sun.COM zio_t *zio; 14136523Sek110237 14147754SJeff.Bonwick@Sun.COM if (gn != NULL) { 14157754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 14167754SJeff.Bonwick@Sun.COM gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 14177754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14187754SJeff.Bonwick@Sun.COM /* 14197754SJeff.Bonwick@Sun.COM * As we rewrite each gang header, the pipeline will compute 14207754SJeff.Bonwick@Sun.COM * a new gang block header checksum for it; but no one will 14217754SJeff.Bonwick@Sun.COM * compute a new data checksum, so we do that here. The one 14227754SJeff.Bonwick@Sun.COM * exception is the gang leader: the pipeline already computed 14237754SJeff.Bonwick@Sun.COM * its data checksum because that stage precedes gang assembly. 14247754SJeff.Bonwick@Sun.COM * (Presently, nothing actually uses interior data checksums; 14257754SJeff.Bonwick@Sun.COM * this is just good hygiene.) 
14267754SJeff.Bonwick@Sun.COM */ 14279443SBill.Moore@Sun.COM if (gn != pio->io_gang_leader->io_gang_tree) { 14287754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 14297754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp)); 14307754SJeff.Bonwick@Sun.COM } 143110922SJeff.Bonwick@Sun.COM /* 143210922SJeff.Bonwick@Sun.COM * If we are here to damage data for testing purposes, 143310922SJeff.Bonwick@Sun.COM * leave the GBH alone so that we can detect the damage. 143410922SJeff.Bonwick@Sun.COM */ 143510922SJeff.Bonwick@Sun.COM if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 143610922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 14377754SJeff.Bonwick@Sun.COM } else { 14387754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 14397754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 14407754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14416523Sek110237 } 14426523Sek110237 14437754SJeff.Bonwick@Sun.COM return (zio); 14447754SJeff.Bonwick@Sun.COM } 14457754SJeff.Bonwick@Sun.COM 14467754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14477754SJeff.Bonwick@Sun.COM zio_t * 14487754SJeff.Bonwick@Sun.COM zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14497754SJeff.Bonwick@Sun.COM { 145010922SJeff.Bonwick@Sun.COM return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 145110922SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio))); 14527754SJeff.Bonwick@Sun.COM } 14537754SJeff.Bonwick@Sun.COM 14547754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14557754SJeff.Bonwick@Sun.COM zio_t * 14567754SJeff.Bonwick@Sun.COM zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14577754SJeff.Bonwick@Sun.COM { 14587754SJeff.Bonwick@Sun.COM return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 14597754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 14607754SJeff.Bonwick@Sun.COM } 14617754SJeff.Bonwick@Sun.COM 14627754SJeff.Bonwick@Sun.COM 
/*
 * Per-I/O-type gang callbacks used by zio_gang_tree_issue(), indexed by
 * the gang leader's io_type.  Slot order presumably follows the zio type
 * enumeration (cf. zio_type_name[]: null, read, write, free, claim,
 * ioctl) -- confirm against the enum definition.  The first and last
 * slots are NULL, so zio_gang_tree_issue() must never be reached with
 * those types.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

/*
 * Allocate a zeroed gang tree node together with a SPA_GANGBLOCKSIZE
 * buffer for its gang header, store the node in *gnpp and return it.
 * *gnpp must be NULL on entry.  Both allocations may sleep (KM_SLEEP).
 */
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

/*
 * Free a single gang tree node and its header buffer, then clear *gnpp.
 * All child slots must already be NULL; use zio_gang_tree_free() to
 * release a subtree.
 */
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

/*
 * Recursively free the gang tree rooted at *gnpp (children first, then
 * the node itself).  A NULL *gnpp is a no-op.
 */
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

/*
 * Allocate a tree node at *gnpp and issue (without waiting) a read of the
 * gang header that 'bp' points to into the node's header buffer.  The read
 * is a child of the gang leader 'gio'; zio_gang_tree_assemble_done() runs
 * as its done callback with the node as private data.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

/*
 * Done callback for a gang header read started by zio_gang_tree_assemble().
 * On success, byteswaps the header if the bp calls for it and recursively
 * starts assembly for every header entry that is itself a gang block.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	/*
	 * On a failed read the header contents are not examined.  The node
	 * is not freed here; zio_gang_issue() frees the whole tree when a
	 * gang child error has been recorded on the leader.
	 */
	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	/* Only nested gang blocks get a child node; other entries are leaves. */
	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

/*
 * Walk an assembled gang tree and invoke the gang leader's per-type
 * callback on 'bp' and, for gang headers, on every non-hole bp beneath it.
 *
 *	pio	parent for the zio created at this level
 *	gn	tree node for 'bp' when it is a gang header, else NULL
 *	bp	block pointer being processed
 *	data	position in the leader's data buffer for this subtree
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		/*
		 * Recurse with the zio returned for this header as the
		 * parent, advancing 'data' by each entry's physical size.
		 */
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/* At the root, the walk must have consumed the leader's whole buffer. */
	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	/* The callback may hand back pio itself; only dispatch a new zio. */
	if (zio != pio)
		zio_nowait(zio);
}

/*
 * Pipeline stage: make 'zio' its own gang leader and start reading the
 * gang header tree for its bp into zio->io_gang_tree.  Always returns
 * ZIO_PIPELINE_CONTINUE; the reads are dispatched asynchronously.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: once all gang children are done, either issue the
 * operation over the assembled gang tree (no gang child error) or free the
 * tree (a gang child failed).  In both cases the remaining pipeline is
 * replaced with ZIO_INTERLOCK_PIPELINE.  Returns ZIO_PIPELINE_STOP while
 * gang children are still outstanding.
 */
static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Ready callback for a gang member write created by zio_write_gang_block().
 * Adds the member's allocated size, DVA by DVA, to the corresponding DVA
 * of the parent's bp while holding the parent's io_lock.  Does nothing if
 * the member's bp is a hole.
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	/* The parent's bp is updated under its lock. */
	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * Write pio's data as a gang block: allocate a gang header, then split the
 * data into child writes whose bps live in that header.
 *
 * If the header allocation fails, the error is stored in pio->io_error
 * and ZIO_PIPELINE_CONTINUE is returned without creating any children.
 * Otherwise pio's pipeline is replaced with ZIO_INTERLOCK_PIPELINE and
 * ZIO_PIPELINE_CONTINUE is returned after the header and member zios have
 * been dispatched.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	/* One more copy of the header than of the data, capped by the pool. */
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	/*
	 * NOTE(review): the leader's bp is passed only for nested gang
	 * writes (pio != gio); given METASLAB_HINTBP_FAVOR it is presumably
	 * the allocation hint -- confirm against metaslab_alloc().
	 */
	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * The leader hangs the new node off its own io_gang_tree; a nested
	 * gang write uses the child slot that was passed to it as private
	 * data (see the &gn->gn_child[g] argument to zio_write() below).
	 */
	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 *
	 * Each pass takes an even share of the remaining bytes across the
	 * header slots still unused, rounded up to SPA_MINBLOCKSIZE.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		/*
		 * Members take the leader's checksum and copies settings,
		 * with compression and dedup turned off.
		 */
		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = 0;
		zp.zp_dedup_verify = 0;

		/* Member writes are children of the header zio, not of pio. */
		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */

/*
 * Done callback for a repair read of an alternate copy of a dedup'd block
 * (issued by zio_ddt_read_start()).  Under the parent's io_lock: a copy
 * that read successfully has its phys entry cleared, and the first
 * successful buffer is kept as dde_repair_data.  Every other buffer is
 * freed here.
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}

/*
 * Pipeline stage: start the read of a dedup'd block.
 *
 * Normally this issues a single DDT child read of 'bp' into zio->io_data.
 * When a DDT child error is already recorded (this stage is re-entered
 * from zio_ddt_read_done() after a failed read), it instead starts a
 * repair: the entry from ddt_repair_start() is stashed in io_vsd and
 * reads are issued for the entry's other physical copies.  Always returns
 * ZIO_PIPELINE_CONTINUE.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		/* No phys entry matches this bp: no alternates are read. */
		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * Read every populated copy other than the one that just
		 * failed.  Each read gets its own buffer, which
		 * zio_ddt_child_read_done() either keeps or frees.
		 */
		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: finish the read of a dedup'd block once all DDT children
 * are done.
 *
 * With no DDT child error there is nothing to do.  With an error and no
 * repair entry yet, the zio is rewound to just before
 * ZIO_STAGE_DDT_READ_START and redispatched so the repair path runs.  With
 * an error and a repair entry, any repair data found is copied into
 * zio->io_data and the DDT child error is cleared before the repair is
 * finished.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		/* No DDT available: expected only while the pool is loading. */
		if (ddt == NULL) {
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		/* First failure: rerun the start stage to begin the repair. */
		if (dde == NULL) {
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		/* An alternate copy was readable: use it and clear the error. */
		if (dde->dde_repair_data != NULL) {
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Decide whether the data being written by 'zio' differs from the data
 * already associated with DDT entry 'dde'.
 *
 * If a write for this entry is in flight (a lead zio exists), the
 * comparison is made against that zio's original data.  Otherwise the
 * first populated physical copy is read through the ARC and compared.
 * Returns B_TRUE when the data differs or the on-disk copy could not be
 * read, and B_FALSE when the data matches or there is nothing to compare.
 *
 * NOTE(review): ddt_exit()/ddt_enter() bracket the ARC read, so the caller
 * presumably holds the DDT lock on entry and again on return -- confirm
 * against the caller.
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read_nolock(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = EEXIST;
				VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}

/*
 * Ready callback for the lead write of a dedup'd block.  On success,
 * records the child's bp in the entry's phys slot selected by zp_copies
 * and copies that phys information into the bp of every parent, all
 * between ddt_enter() and ddt_exit().  Does nothing if the write failed.
 */
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}

/*
 * Done callback for the lead write of a dedup'd block.  Clears the lead
 * zio slot; on success takes one phys reference per parent, on failure
 * clears the phys entry.
 */
static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}

/*
 * Done callback for a ditto write of a dedup'd block.  Clears the
 * DDT_PHYS_DITTO lead zio slot; on success frees any previously recorded
 * ditto phys (non-zero birth) and records the new bp in its place.
 */
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}

static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;

ddt_phys_t *ddp; 198110922SJeff.Bonwick@Sun.COM 198210922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 198310922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 198410922SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 198510922SJeff.Bonwick@Sun.COM 198610922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 198710922SJeff.Bonwick@Sun.COM dde = ddt_lookup(ddt, bp, B_TRUE); 198810922SJeff.Bonwick@Sun.COM ddp = &dde->dde_phys[p]; 198910922SJeff.Bonwick@Sun.COM 199010922SJeff.Bonwick@Sun.COM if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 199110922SJeff.Bonwick@Sun.COM /* 199210922SJeff.Bonwick@Sun.COM * If we're using a weak checksum, upgrade to a strong checksum 199310922SJeff.Bonwick@Sun.COM * and try again. If we're already using a strong checksum, 199410922SJeff.Bonwick@Sun.COM * we can't resolve it, so just convert to an ordinary write. 199510922SJeff.Bonwick@Sun.COM * (And automatically e-mail a paper to Nature?) 199610922SJeff.Bonwick@Sun.COM */ 199710922SJeff.Bonwick@Sun.COM if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 199810922SJeff.Bonwick@Sun.COM zp->zp_checksum = spa_dedup_checksum(spa); 199910922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 200010922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 200110922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 200210922SJeff.Bonwick@Sun.COM } else { 200310922SJeff.Bonwick@Sun.COM zp->zp_dedup = 0; 200410922SJeff.Bonwick@Sun.COM } 200510922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 200610922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 200710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 200810922SJeff.Bonwick@Sun.COM } 200910922SJeff.Bonwick@Sun.COM 201010922SJeff.Bonwick@Sun.COM ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 201110922SJeff.Bonwick@Sun.COM ASSERT(ditto_copies < SPA_DVAS_PER_BP); 201210922SJeff.Bonwick@Sun.COM 201310922SJeff.Bonwick@Sun.COM if (ditto_copies > ddt_ditto_copies_present(dde) && 201410922SJeff.Bonwick@Sun.COM 
dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 201510922SJeff.Bonwick@Sun.COM zio_prop_t czp = *zp; 201610922SJeff.Bonwick@Sun.COM 201710922SJeff.Bonwick@Sun.COM czp.zp_copies = ditto_copies; 201810922SJeff.Bonwick@Sun.COM 201910922SJeff.Bonwick@Sun.COM /* 202010922SJeff.Bonwick@Sun.COM * If we arrived here with an override bp, we won't have run 202110922SJeff.Bonwick@Sun.COM * the transform stack, so we won't have the data we need to 202210922SJeff.Bonwick@Sun.COM * generate a child i/o. So, toss the override bp and restart. 202310922SJeff.Bonwick@Sun.COM * This is safe, because using the override bp is just an 202410922SJeff.Bonwick@Sun.COM * optimization; and it's rare, so the cost doesn't matter. 202510922SJeff.Bonwick@Sun.COM */ 202610922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 202710922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 202810922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 202910922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 203010922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 203110922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 203210922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 203310922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 203410922SJeff.Bonwick@Sun.COM } 203510922SJeff.Bonwick@Sun.COM 203610922SJeff.Bonwick@Sun.COM dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 203710922SJeff.Bonwick@Sun.COM zio->io_orig_size, &czp, NULL, 203810922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done, dde, zio->io_priority, 203910922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 204010922SJeff.Bonwick@Sun.COM 204110922SJeff.Bonwick@Sun.COM zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 204210922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 204310922SJeff.Bonwick@Sun.COM } 204410922SJeff.Bonwick@Sun.COM 204510922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 204610922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 
204710922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, bp, txg); 204810922SJeff.Bonwick@Sun.COM if (dde->dde_lead_zio[p] != NULL) 204910922SJeff.Bonwick@Sun.COM zio_add_child(zio, dde->dde_lead_zio[p]); 205010922SJeff.Bonwick@Sun.COM else 205110922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 205210922SJeff.Bonwick@Sun.COM } else if (zio->io_bp_override) { 205310922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == txg); 205410922SJeff.Bonwick@Sun.COM ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 205510922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 205610922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 205710922SJeff.Bonwick@Sun.COM } else { 205810922SJeff.Bonwick@Sun.COM cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 205910922SJeff.Bonwick@Sun.COM zio->io_orig_size, zp, zio_ddt_child_write_ready, 206010922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done, dde, zio->io_priority, 206110922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 206210922SJeff.Bonwick@Sun.COM 206310922SJeff.Bonwick@Sun.COM zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 206410922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = cio; 206510922SJeff.Bonwick@Sun.COM } 206610922SJeff.Bonwick@Sun.COM 206710922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 206810922SJeff.Bonwick@Sun.COM 206910922SJeff.Bonwick@Sun.COM if (cio) 207010922SJeff.Bonwick@Sun.COM zio_nowait(cio); 207110922SJeff.Bonwick@Sun.COM if (dio) 207210922SJeff.Bonwick@Sun.COM zio_nowait(dio); 207310922SJeff.Bonwick@Sun.COM 207410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 207510922SJeff.Bonwick@Sun.COM } 207610922SJeff.Bonwick@Sun.COM 207712296SLin.Ling@Sun.COM ddt_entry_t *freedde; /* for debugging */ 207812296SLin.Ling@Sun.COM 207910922SJeff.Bonwick@Sun.COM static int 208010922SJeff.Bonwick@Sun.COM zio_ddt_free(zio_t *zio) 208110922SJeff.Bonwick@Sun.COM { 208210922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 208310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 208410922SJeff.Bonwick@Sun.COM ddt_t *ddt = 
ddt_select(spa, bp); 208510922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 208610922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 208710922SJeff.Bonwick@Sun.COM 208810922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 208910922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 209010922SJeff.Bonwick@Sun.COM 209110922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 209212296SLin.Ling@Sun.COM freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 209310922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 209410922SJeff.Bonwick@Sun.COM ddt_phys_decref(ddp); 209510922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 209610922SJeff.Bonwick@Sun.COM 209710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 209810922SJeff.Bonwick@Sun.COM } 209910922SJeff.Bonwick@Sun.COM 210010922SJeff.Bonwick@Sun.COM /* 210110922SJeff.Bonwick@Sun.COM * ========================================================================== 2102789Sahrens * Allocate and free blocks 2103789Sahrens * ========================================================================== 2104789Sahrens */ 21055530Sbonwick static int 2106789Sahrens zio_dva_allocate(zio_t *zio) 2107789Sahrens { 21084527Sperrin spa_t *spa = zio->io_spa; 210910922SJeff.Bonwick@Sun.COM metaslab_class_t *mc = spa_normal_class(spa); 2110789Sahrens blkptr_t *bp = zio->io_bp; 2111789Sahrens int error; 2112789Sahrens 21139443SBill.Moore@Sun.COM if (zio->io_gang_leader == NULL) { 21149443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 21159443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 21169443SBill.Moore@Sun.COM } 21179443SBill.Moore@Sun.COM 2118789Sahrens ASSERT(BP_IS_HOLE(bp)); 21191775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 212010922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, >, 0); 212110922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2122789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2123789Sahrens 21247754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, mc, zio->io_size, 
bp, 212510922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies, zio->io_txg, NULL, 0); 2126789Sahrens 21277754SJeff.Bonwick@Sun.COM if (error) { 21287754SJeff.Bonwick@Sun.COM if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 21297754SJeff.Bonwick@Sun.COM return (zio_write_gang_block(zio)); 2130789Sahrens zio->io_error = error; 2131789Sahrens } 21325530Sbonwick 21335530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2134789Sahrens } 2135789Sahrens 21365530Sbonwick static int 2137789Sahrens zio_dva_free(zio_t *zio) 2138789Sahrens { 21397754SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2140789Sahrens 21415530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2142789Sahrens } 2143789Sahrens 21445530Sbonwick static int 2145789Sahrens zio_dva_claim(zio_t *zio) 2146789Sahrens { 21477754SJeff.Bonwick@Sun.COM int error; 21487754SJeff.Bonwick@Sun.COM 21497754SJeff.Bonwick@Sun.COM error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 21507754SJeff.Bonwick@Sun.COM if (error) 21517754SJeff.Bonwick@Sun.COM zio->io_error = error; 2152789Sahrens 21535530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2154789Sahrens } 2155789Sahrens 2156789Sahrens /* 21577754SJeff.Bonwick@Sun.COM * Undo an allocation. This is used by zio_done() when an I/O fails 21587754SJeff.Bonwick@Sun.COM * and we want to give back the block we just allocated. 21597754SJeff.Bonwick@Sun.COM * This handles both normal blocks and gang blocks. 
21607754SJeff.Bonwick@Sun.COM */ 21617754SJeff.Bonwick@Sun.COM static void 21627754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 21637754SJeff.Bonwick@Sun.COM { 21647754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 216510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_bp_override == NULL); 21667754SJeff.Bonwick@Sun.COM 21677754SJeff.Bonwick@Sun.COM if (!BP_IS_HOLE(bp)) 216810922SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 21697754SJeff.Bonwick@Sun.COM 21707754SJeff.Bonwick@Sun.COM if (gn != NULL) { 21717754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 21727754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio, gn->gn_child[g], 21737754SJeff.Bonwick@Sun.COM &gn->gn_gbh->zg_blkptr[g]); 21747754SJeff.Bonwick@Sun.COM } 21757754SJeff.Bonwick@Sun.COM } 21767754SJeff.Bonwick@Sun.COM } 21777754SJeff.Bonwick@Sun.COM 21787754SJeff.Bonwick@Sun.COM /* 21797754SJeff.Bonwick@Sun.COM * Try to allocate an intent log block. Return 0 on success, errno on failure. 
21807754SJeff.Bonwick@Sun.COM */ 21817754SJeff.Bonwick@Sun.COM int 218210922SJeff.Bonwick@Sun.COM zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 218310922SJeff.Bonwick@Sun.COM uint64_t size, boolean_t use_slog) 21847754SJeff.Bonwick@Sun.COM { 218510310SNeil.Perrin@Sun.COM int error = 1; 21867754SJeff.Bonwick@Sun.COM 218710922SJeff.Bonwick@Sun.COM ASSERT(txg > spa_syncing_txg(spa)); 218810922SJeff.Bonwick@Sun.COM 218910879SNeil.Perrin@Sun.COM if (use_slog) 219010922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_log_class(spa), size, 219110310SNeil.Perrin@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21927754SJeff.Bonwick@Sun.COM 21937754SJeff.Bonwick@Sun.COM if (error) 219410922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), size, 21957754SJeff.Bonwick@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21967754SJeff.Bonwick@Sun.COM 21977754SJeff.Bonwick@Sun.COM if (error == 0) { 21987754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(new_bp, size); 21997754SJeff.Bonwick@Sun.COM BP_SET_PSIZE(new_bp, size); 22007754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 220111670SNeil.Perrin@Sun.COM BP_SET_CHECKSUM(new_bp, 220211670SNeil.Perrin@Sun.COM spa_version(spa) >= SPA_VERSION_SLIM_ZIL 220311670SNeil.Perrin@Sun.COM ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 22047754SJeff.Bonwick@Sun.COM BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 22057754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(new_bp, 0); 220610922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(new_bp, 0); 22077754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 22087754SJeff.Bonwick@Sun.COM } 22097754SJeff.Bonwick@Sun.COM 22107754SJeff.Bonwick@Sun.COM return (error); 22117754SJeff.Bonwick@Sun.COM } 22127754SJeff.Bonwick@Sun.COM 22137754SJeff.Bonwick@Sun.COM /* 221410922SJeff.Bonwick@Sun.COM * Free an intent log block. 
22157754SJeff.Bonwick@Sun.COM */ 22167754SJeff.Bonwick@Sun.COM void 221710922SJeff.Bonwick@Sun.COM zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 22187754SJeff.Bonwick@Sun.COM { 221910922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 22207754SJeff.Bonwick@Sun.COM ASSERT(!BP_IS_GANG(bp)); 22217754SJeff.Bonwick@Sun.COM 222210922SJeff.Bonwick@Sun.COM zio_free(spa, txg, bp); 22237754SJeff.Bonwick@Sun.COM } 22247754SJeff.Bonwick@Sun.COM 22257754SJeff.Bonwick@Sun.COM /* 2226789Sahrens * ========================================================================== 2227789Sahrens * Read and write to physical devices 2228789Sahrens * ========================================================================== 2229789Sahrens */ 22305530Sbonwick static int 22311775Sbillm zio_vdev_io_start(zio_t *zio) 2232789Sahrens { 2233789Sahrens vdev_t *vd = zio->io_vd; 22341775Sbillm uint64_t align; 22355329Sgw25295 spa_t *spa = zio->io_spa; 22365329Sgw25295 22377754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0); 22387754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 22397754SJeff.Bonwick@Sun.COM 22407754SJeff.Bonwick@Sun.COM if (vd == NULL) { 22417754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 22427754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2243789Sahrens 22447754SJeff.Bonwick@Sun.COM /* 22457754SJeff.Bonwick@Sun.COM * The mirror_ops handle multiple DVAs in a single BP. 
22467754SJeff.Bonwick@Sun.COM */ 22475530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 22487754SJeff.Bonwick@Sun.COM } 22491775Sbillm 22507754SJeff.Bonwick@Sun.COM align = 1ULL << vd->vdev_top->vdev_ashift; 2251789Sahrens 22521732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 22531732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 22541732Sbonwick char *abuf = zio_buf_alloc(asize); 22557754SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 22561732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 22571732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 22581732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 22591732Sbonwick } 22607754SJeff.Bonwick@Sun.COM zio_push_transform(zio, abuf, asize, asize, zio_subblock); 22611732Sbonwick } 22621732Sbonwick 22631732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 22641732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 22658241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 22668241SJeff.Bonwick@Sun.COM 22678241SJeff.Bonwick@Sun.COM /* 22688241SJeff.Bonwick@Sun.COM * If this is a repair I/O, and there's no self-healing involved -- 22698241SJeff.Bonwick@Sun.COM * that is, we're just resilvering what we expect to resilver -- 22708241SJeff.Bonwick@Sun.COM * then don't do the I/O unless zio's txg is actually in vd's DTL. 22718241SJeff.Bonwick@Sun.COM * This prevents spurious resilvering with nested replication. 22728241SJeff.Bonwick@Sun.COM * For example, given a mirror of mirrors, (A+B)+(C+D), if only 22738241SJeff.Bonwick@Sun.COM * A is out of date, we'll read from C+D, then use the data to 22748241SJeff.Bonwick@Sun.COM * resilver A+B -- but we don't actually want to resilver B, just A. 22758241SJeff.Bonwick@Sun.COM * The top-level mirror has no way to know this, so instead we just 22768241SJeff.Bonwick@Sun.COM * discard unnecessary repairs as we work our way down the vdev tree. 
22778241SJeff.Bonwick@Sun.COM * The same logic applies to any form of nested replication: 22788241SJeff.Bonwick@Sun.COM * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 22798241SJeff.Bonwick@Sun.COM */ 22808241SJeff.Bonwick@Sun.COM if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 22818241SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 22828241SJeff.Bonwick@Sun.COM zio->io_txg != 0 && /* not a delegated i/o */ 22838241SJeff.Bonwick@Sun.COM !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 22848241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 22858241SJeff.Bonwick@Sun.COM zio_vdev_io_bypass(zio); 22868241SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 22878241SJeff.Bonwick@Sun.COM } 2288789Sahrens 22897754SJeff.Bonwick@Sun.COM if (vd->vdev_ops->vdev_op_leaf && 22907754SJeff.Bonwick@Sun.COM (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 22917754SJeff.Bonwick@Sun.COM 22927754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 22938632SBill.Moore@Sun.COM return (ZIO_PIPELINE_CONTINUE); 22947754SJeff.Bonwick@Sun.COM 22957754SJeff.Bonwick@Sun.COM if ((zio = vdev_queue_io(zio)) == NULL) 22967754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22977754SJeff.Bonwick@Sun.COM 22987754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 22997754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23007754SJeff.Bonwick@Sun.COM zio_interrupt(zio); 23017754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23027754SJeff.Bonwick@Sun.COM } 23037754SJeff.Bonwick@Sun.COM } 23047754SJeff.Bonwick@Sun.COM 23055530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 2306789Sahrens } 2307789Sahrens 23085530Sbonwick static int 2309789Sahrens zio_vdev_io_done(zio_t *zio) 2310789Sahrens { 23117754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 23127754SJeff.Bonwick@Sun.COM vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 23137754SJeff.Bonwick@Sun.COM boolean_t unexpected_error = B_FALSE; 23145530Sbonwick 23157754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 23167754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23177754SJeff.Bonwick@Sun.COM 23187754SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2319789Sahrens 23207754SJeff.Bonwick@Sun.COM if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 23217754SJeff.Bonwick@Sun.COM 23227754SJeff.Bonwick@Sun.COM vdev_queue_io_done(zio); 23237754SJeff.Bonwick@Sun.COM 23247754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE) 23257754SJeff.Bonwick@Sun.COM vdev_cache_write(zio); 23267754SJeff.Bonwick@Sun.COM 23277754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23289725SEric.Schrock@Sun.COM zio->io_error = zio_handle_device_injection(vd, 23299725SEric.Schrock@Sun.COM zio, EIO); 2330789Sahrens 23317754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23327754SJeff.Bonwick@Sun.COM zio->io_error = zio_handle_label_injection(zio, EIO); 23337754SJeff.Bonwick@Sun.COM 23347754SJeff.Bonwick@Sun.COM if (zio->io_error) { 23357754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 23367754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23377754SJeff.Bonwick@Sun.COM } else { 23387754SJeff.Bonwick@Sun.COM unexpected_error = B_TRUE; 23397754SJeff.Bonwick@Sun.COM } 23407754SJeff.Bonwick@Sun.COM } 23416976Seschrock } 23427754SJeff.Bonwick@Sun.COM 23437754SJeff.Bonwick@Sun.COM ops->vdev_op_io_done(zio); 2344789Sahrens 23457754SJeff.Bonwick@Sun.COM if (unexpected_error) 23468632SBill.Moore@Sun.COM VERIFY(vdev_probe(vd, zio) == NULL); 23477754SJeff.Bonwick@Sun.COM 23487754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2349789Sahrens } 2350789Sahrens 235110614SJonathan.Adams@Sun.COM /* 235210614SJonathan.Adams@Sun.COM * For non-raidz ZIOs, we can just copy aside the bad data read from the 
235310614SJonathan.Adams@Sun.COM * disk, and use that to finish the checksum ereport later. 235410614SJonathan.Adams@Sun.COM */ 235510614SJonathan.Adams@Sun.COM static void 235610614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 235710614SJonathan.Adams@Sun.COM const void *good_buf) 235810614SJonathan.Adams@Sun.COM { 235910614SJonathan.Adams@Sun.COM /* no processing needed */ 236010614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 236110614SJonathan.Adams@Sun.COM } 236210614SJonathan.Adams@Sun.COM 236310614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 236410614SJonathan.Adams@Sun.COM void 236510614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 236610614SJonathan.Adams@Sun.COM { 236710614SJonathan.Adams@Sun.COM void *buf = zio_buf_alloc(zio->io_size); 236810614SJonathan.Adams@Sun.COM 236910614SJonathan.Adams@Sun.COM bcopy(zio->io_data, buf, zio->io_size); 237010614SJonathan.Adams@Sun.COM 237110614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = zio->io_size; 237210614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = buf; 237310614SJonathan.Adams@Sun.COM zcr->zcr_finish = zio_vsd_default_cksum_finish; 237410614SJonathan.Adams@Sun.COM zcr->zcr_free = zio_buf_free; 237510614SJonathan.Adams@Sun.COM } 237610614SJonathan.Adams@Sun.COM 23775530Sbonwick static int 2378789Sahrens zio_vdev_io_assess(zio_t *zio) 2379789Sahrens { 2380789Sahrens vdev_t *vd = zio->io_vd; 2381789Sahrens 23827754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 23837754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 2384789Sahrens 23857754SJeff.Bonwick@Sun.COM if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 23867754SJeff.Bonwick@Sun.COM spa_config_exit(zio->io_spa, SCL_ZIO, zio); 23877754SJeff.Bonwick@Sun.COM 23887754SJeff.Bonwick@Sun.COM if (zio->io_vsd != NULL) { 238910614SJonathan.Adams@Sun.COM zio->io_vsd_ops->vsd_free(zio); 
23907754SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 23911732Sbonwick } 23921732Sbonwick 23937754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23941544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 2395789Sahrens 2396789Sahrens /* 2397789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 239811173SJonathan.Adams@Sun.COM * 239911173SJonathan.Adams@Sun.COM * On retry, we cut in line in the issue queue, since we don't want 240011173SJonathan.Adams@Sun.COM * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2401789Sahrens */ 24027754SJeff.Bonwick@Sun.COM if (zio->io_error && vd == NULL && 24037754SJeff.Bonwick@Sun.COM !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 24047754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 24057754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2406789Sahrens zio->io_error = 0; 24077754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_RETRY | 24087754SJeff.Bonwick@Sun.COM ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 240910922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 241011173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 241111173SJonathan.Adams@Sun.COM zio_requeue_io_start_cut_in_line); 24127754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 24137754SJeff.Bonwick@Sun.COM } 2414789Sahrens 24157754SJeff.Bonwick@Sun.COM /* 24167754SJeff.Bonwick@Sun.COM * If we got an error on a leaf device, convert it to ENXIO 24177754SJeff.Bonwick@Sun.COM * if the device is not accessible at all. 
24187754SJeff.Bonwick@Sun.COM */ 24197754SJeff.Bonwick@Sun.COM if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 24207754SJeff.Bonwick@Sun.COM !vdev_accessible(vd, zio)) 24217754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 24227754SJeff.Bonwick@Sun.COM 24237754SJeff.Bonwick@Sun.COM /* 24247754SJeff.Bonwick@Sun.COM * If we can't write to an interior vdev (mirror or RAID-Z), 24257754SJeff.Bonwick@Sun.COM * set vdev_cant_write so that we stop trying to allocate from it. 24267754SJeff.Bonwick@Sun.COM */ 24277754SJeff.Bonwick@Sun.COM if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 24287754SJeff.Bonwick@Sun.COM vd != NULL && !vd->vdev_ops->vdev_op_leaf) 24297754SJeff.Bonwick@Sun.COM vd->vdev_cant_write = B_TRUE; 24307754SJeff.Bonwick@Sun.COM 24317754SJeff.Bonwick@Sun.COM if (zio->io_error) 24327754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2433789Sahrens 24345530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2435789Sahrens } 2436789Sahrens 2437789Sahrens void 2438789Sahrens zio_vdev_io_reissue(zio_t *zio) 2439789Sahrens { 2440789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2441789Sahrens ASSERT(zio->io_error == 0); 2442789Sahrens 244310922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2444789Sahrens } 2445789Sahrens 2446789Sahrens void 2447789Sahrens zio_vdev_io_redone(zio_t *zio) 2448789Sahrens { 2449789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2450789Sahrens 245110922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2452789Sahrens } 2453789Sahrens 2454789Sahrens void 2455789Sahrens zio_vdev_io_bypass(zio_t *zio) 2456789Sahrens { 2457789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2458789Sahrens ASSERT(zio->io_error == 0); 2459789Sahrens 2460789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 246110922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2462789Sahrens } 2463789Sahrens 2464789Sahrens /* 2465789Sahrens * 
========================================================================== 2466789Sahrens * Generate and verify checksums 2467789Sahrens * ========================================================================== 2468789Sahrens */ 24695530Sbonwick static int 2470789Sahrens zio_checksum_generate(zio_t *zio) 2471789Sahrens { 2472789Sahrens blkptr_t *bp = zio->io_bp; 24737754SJeff.Bonwick@Sun.COM enum zio_checksum checksum; 2474789Sahrens 24757754SJeff.Bonwick@Sun.COM if (bp == NULL) { 24767754SJeff.Bonwick@Sun.COM /* 24777754SJeff.Bonwick@Sun.COM * This is zio_write_phys(). 24787754SJeff.Bonwick@Sun.COM * We're either generating a label checksum, or none at all. 24797754SJeff.Bonwick@Sun.COM */ 24807754SJeff.Bonwick@Sun.COM checksum = zio->io_prop.zp_checksum; 2481789Sahrens 24827754SJeff.Bonwick@Sun.COM if (checksum == ZIO_CHECKSUM_OFF) 24837754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2484789Sahrens 24857754SJeff.Bonwick@Sun.COM ASSERT(checksum == ZIO_CHECKSUM_LABEL); 24867754SJeff.Bonwick@Sun.COM } else { 24877754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 24887754SJeff.Bonwick@Sun.COM ASSERT(!IO_IS_ALLOCATING(zio)); 24897754SJeff.Bonwick@Sun.COM checksum = ZIO_CHECKSUM_GANG_HEADER; 24907754SJeff.Bonwick@Sun.COM } else { 24917754SJeff.Bonwick@Sun.COM checksum = BP_GET_CHECKSUM(bp); 24927754SJeff.Bonwick@Sun.COM } 24937754SJeff.Bonwick@Sun.COM } 2494789Sahrens 24957754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2496789Sahrens 24975530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2498789Sahrens } 2499789Sahrens 25005530Sbonwick static int 2501789Sahrens zio_checksum_verify(zio_t *zio) 2502789Sahrens { 250310614SJonathan.Adams@Sun.COM zio_bad_cksum_t info; 25047754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 25057754SJeff.Bonwick@Sun.COM int error; 25067754SJeff.Bonwick@Sun.COM 250710922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vd != NULL); 250810922SJeff.Bonwick@Sun.COM 
25097754SJeff.Bonwick@Sun.COM if (bp == NULL) { 25107754SJeff.Bonwick@Sun.COM /* 25117754SJeff.Bonwick@Sun.COM * This is zio_read_phys(). 25127754SJeff.Bonwick@Sun.COM * We're either verifying a label checksum, or nothing at all. 25137754SJeff.Bonwick@Sun.COM */ 25147754SJeff.Bonwick@Sun.COM if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 25157754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 25167754SJeff.Bonwick@Sun.COM 25177754SJeff.Bonwick@Sun.COM ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 25187754SJeff.Bonwick@Sun.COM } 25197754SJeff.Bonwick@Sun.COM 252010614SJonathan.Adams@Sun.COM if ((error = zio_checksum_error(zio, &info)) != 0) { 25217754SJeff.Bonwick@Sun.COM zio->io_error = error; 25227754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 252310614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum(zio->io_spa, 252410614SJonathan.Adams@Sun.COM zio->io_vd, zio, zio->io_offset, 252510614SJonathan.Adams@Sun.COM zio->io_size, NULL, &info); 25267754SJeff.Bonwick@Sun.COM } 2527789Sahrens } 2528789Sahrens 25295530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2530789Sahrens } 2531789Sahrens 2532789Sahrens /* 2533789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 2534789Sahrens */ 2535789Sahrens void 2536789Sahrens zio_checksum_verified(zio_t *zio) 2537789Sahrens { 253810922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2539789Sahrens } 2540789Sahrens 2541789Sahrens /* 25427754SJeff.Bonwick@Sun.COM * ========================================================================== 25437754SJeff.Bonwick@Sun.COM * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 25447754SJeff.Bonwick@Sun.COM * An error of 0 indictes success. ENXIO indicates whole-device failure, 25457754SJeff.Bonwick@Sun.COM * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 25467754SJeff.Bonwick@Sun.COM * indicate errors that are specific to one I/O, and most likely permanent. 
25477754SJeff.Bonwick@Sun.COM * Any other error is presumed to be worse because we weren't expecting it. 25487754SJeff.Bonwick@Sun.COM * ========================================================================== 2549789Sahrens */ 25507754SJeff.Bonwick@Sun.COM int 25517754SJeff.Bonwick@Sun.COM zio_worst_error(int e1, int e2) 2552789Sahrens { 25537754SJeff.Bonwick@Sun.COM static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 25547754SJeff.Bonwick@Sun.COM int r1, r2; 25551775Sbillm 25567754SJeff.Bonwick@Sun.COM for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 25577754SJeff.Bonwick@Sun.COM if (e1 == zio_error_rank[r1]) 25587754SJeff.Bonwick@Sun.COM break; 25597754SJeff.Bonwick@Sun.COM 25607754SJeff.Bonwick@Sun.COM for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 25617754SJeff.Bonwick@Sun.COM if (e2 == zio_error_rank[r2]) 25627754SJeff.Bonwick@Sun.COM break; 25637754SJeff.Bonwick@Sun.COM 25647754SJeff.Bonwick@Sun.COM return (r1 > r2 ? e1 : e2); 2565789Sahrens } 2566789Sahrens 2567789Sahrens /* 2568789Sahrens * ========================================================================== 25697754SJeff.Bonwick@Sun.COM * I/O completion 2570789Sahrens * ========================================================================== 2571789Sahrens */ 25727754SJeff.Bonwick@Sun.COM static int 25737754SJeff.Bonwick@Sun.COM zio_ready(zio_t *zio) 25747754SJeff.Bonwick@Sun.COM { 25757754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 25768632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 25777754SJeff.Bonwick@Sun.COM 257810922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 257910922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 25809443SBill.Moore@Sun.COM return (ZIO_PIPELINE_STOP); 25819443SBill.Moore@Sun.COM 25827754SJeff.Bonwick@Sun.COM if (zio->io_ready) { 25837754SJeff.Bonwick@Sun.COM ASSERT(IO_IS_ALLOCATING(zio)); 25847754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg 
|| BP_IS_HOLE(bp)); 25857754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 25867754SJeff.Bonwick@Sun.COM 25877754SJeff.Bonwick@Sun.COM zio->io_ready(zio); 25887754SJeff.Bonwick@Sun.COM } 25897754SJeff.Bonwick@Sun.COM 25907754SJeff.Bonwick@Sun.COM if (bp != NULL && bp != &zio->io_bp_copy) 25917754SJeff.Bonwick@Sun.COM zio->io_bp_copy = *bp; 25927754SJeff.Bonwick@Sun.COM 25937754SJeff.Bonwick@Sun.COM if (zio->io_error) 25947754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 25957754SJeff.Bonwick@Sun.COM 25968632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 25978632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = 1; 25988632SBill.Moore@Sun.COM pio = zio_walk_parents(zio); 25998632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 26008632SBill.Moore@Sun.COM 26018632SBill.Moore@Sun.COM /* 26028632SBill.Moore@Sun.COM * As we notify zio's parents, new parents could be added. 26038632SBill.Moore@Sun.COM * New parents go to the head of zio's io_parent_list, however, 26048632SBill.Moore@Sun.COM * so we will (correctly) not notify them. The remainder of zio's 26058632SBill.Moore@Sun.COM * io_parent_list, from 'pio_next' onward, cannot change because 26068632SBill.Moore@Sun.COM * all parents must wait for us to be done before they can be done. 
26078632SBill.Moore@Sun.COM */ 26088632SBill.Moore@Sun.COM for (; pio != NULL; pio = pio_next) { 26098632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 26107754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_READY); 26118632SBill.Moore@Sun.COM } 26127754SJeff.Bonwick@Sun.COM 261310922SJeff.Bonwick@Sun.COM if (zio->io_flags & ZIO_FLAG_NODATA) { 261410922SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp)) { 261510922SJeff.Bonwick@Sun.COM zio->io_flags &= ~ZIO_FLAG_NODATA; 261610922SJeff.Bonwick@Sun.COM } else { 261710922SJeff.Bonwick@Sun.COM ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 261810922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 261910922SJeff.Bonwick@Sun.COM } 262010922SJeff.Bonwick@Sun.COM } 262110922SJeff.Bonwick@Sun.COM 262211026STim.Haley@Sun.COM if (zio_injection_enabled && 262311026STim.Haley@Sun.COM zio->io_spa->spa_syncing_txg == zio->io_txg) 262411026STim.Haley@Sun.COM zio_handle_ignored_writes(zio); 262511026STim.Haley@Sun.COM 26267754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 26277754SJeff.Bonwick@Sun.COM } 26287754SJeff.Bonwick@Sun.COM 26297754SJeff.Bonwick@Sun.COM static int 26307754SJeff.Bonwick@Sun.COM zio_done(zio_t *zio) 26317754SJeff.Bonwick@Sun.COM { 26327754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 26337754SJeff.Bonwick@Sun.COM zio_t *lio = zio->io_logical; 26347754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 26357754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 26367754SJeff.Bonwick@Sun.COM uint64_t psize = zio->io_size; 26378632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 26387754SJeff.Bonwick@Sun.COM 26397754SJeff.Bonwick@Sun.COM /* 26409443SBill.Moore@Sun.COM * If our children haven't all completed, 26417754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 
26427754SJeff.Bonwick@Sun.COM */ 26437754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 26447754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 264510922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 26467754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 26477754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 26487754SJeff.Bonwick@Sun.COM 26497754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 26507754SJeff.Bonwick@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 26517754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[c][w] == 0); 26527754SJeff.Bonwick@Sun.COM 26537754SJeff.Bonwick@Sun.COM if (bp != NULL) { 26547754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[0] == 0); 26557754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[1] == 0); 26567754SJeff.Bonwick@Sun.COM ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 26578632SBill.Moore@Sun.COM (bp == zio_unique_parent(zio)->io_bp)); 26587754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 265910922SJeff.Bonwick@Sun.COM zio->io_bp_override == NULL && 26607754SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 26617754SJeff.Bonwick@Sun.COM ASSERT(!BP_SHOULD_BYTESWAP(bp)); 266210922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 26637754SJeff.Bonwick@Sun.COM ASSERT(BP_COUNT_GANG(bp) == 0 || 26647754SJeff.Bonwick@Sun.COM (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 26657754SJeff.Bonwick@Sun.COM } 26667754SJeff.Bonwick@Sun.COM } 26677754SJeff.Bonwick@Sun.COM 26687754SJeff.Bonwick@Sun.COM /* 266910922SJeff.Bonwick@Sun.COM * If there were child vdev/gang/ddt errors, they apply to us now. 
26707754SJeff.Bonwick@Sun.COM */ 26717754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 26727754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 267310922SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 267410922SJeff.Bonwick@Sun.COM 267510922SJeff.Bonwick@Sun.COM /* 267610922SJeff.Bonwick@Sun.COM * If the I/O on the transformed data was successful, generate any 267710922SJeff.Bonwick@Sun.COM * checksum reports now while we still have the transformed data. 267810922SJeff.Bonwick@Sun.COM */ 267910922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 268010922SJeff.Bonwick@Sun.COM while (zio->io_cksum_report != NULL) { 268110922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 268210922SJeff.Bonwick@Sun.COM uint64_t align = zcr->zcr_align; 268310922SJeff.Bonwick@Sun.COM uint64_t asize = P2ROUNDUP(psize, align); 268410922SJeff.Bonwick@Sun.COM char *abuf = zio->io_data; 268510922SJeff.Bonwick@Sun.COM 268610922SJeff.Bonwick@Sun.COM if (asize != psize) { 268710922SJeff.Bonwick@Sun.COM abuf = zio_buf_alloc(asize); 268810922SJeff.Bonwick@Sun.COM bcopy(zio->io_data, abuf, psize); 268910922SJeff.Bonwick@Sun.COM bzero(abuf + psize, asize - psize); 269010922SJeff.Bonwick@Sun.COM } 269110922SJeff.Bonwick@Sun.COM 269210922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 269310922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 269410922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, abuf); 269510922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 269610922SJeff.Bonwick@Sun.COM 269710922SJeff.Bonwick@Sun.COM if (asize != psize) 269810922SJeff.Bonwick@Sun.COM zio_buf_free(abuf, asize); 269910922SJeff.Bonwick@Sun.COM } 270010922SJeff.Bonwick@Sun.COM } 27017754SJeff.Bonwick@Sun.COM 27027754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); /* note: may set zio->io_error */ 27037754SJeff.Bonwick@Sun.COM 27047754SJeff.Bonwick@Sun.COM vdev_stat_update(zio, psize); 27057754SJeff.Bonwick@Sun.COM 
27067754SJeff.Bonwick@Sun.COM if (zio->io_error) { 27077754SJeff.Bonwick@Sun.COM /* 27087754SJeff.Bonwick@Sun.COM * If this I/O is attached to a particular vdev, 27097754SJeff.Bonwick@Sun.COM * generate an error message describing the I/O failure 27107754SJeff.Bonwick@Sun.COM * at the block level. We ignore these errors if the 27117754SJeff.Bonwick@Sun.COM * device is currently unavailable. 27127754SJeff.Bonwick@Sun.COM */ 27137754SJeff.Bonwick@Sun.COM if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 27147754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 27157754SJeff.Bonwick@Sun.COM 271610685SGeorge.Wilson@Sun.COM if ((zio->io_error == EIO || !(zio->io_flags & 271710685SGeorge.Wilson@Sun.COM (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 271810685SGeorge.Wilson@Sun.COM zio == lio) { 27197754SJeff.Bonwick@Sun.COM /* 27207754SJeff.Bonwick@Sun.COM * For logical I/O requests, tell the SPA to log the 27217754SJeff.Bonwick@Sun.COM * error and generate a logical data ereport. 27227754SJeff.Bonwick@Sun.COM */ 27237754SJeff.Bonwick@Sun.COM spa_log_error(spa, zio); 27247754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 27257754SJeff.Bonwick@Sun.COM 0, 0); 27267754SJeff.Bonwick@Sun.COM } 27277754SJeff.Bonwick@Sun.COM } 27287754SJeff.Bonwick@Sun.COM 27297754SJeff.Bonwick@Sun.COM if (zio->io_error && zio == lio) { 27307754SJeff.Bonwick@Sun.COM /* 27317754SJeff.Bonwick@Sun.COM * Determine whether zio should be reexecuted. This will 27327754SJeff.Bonwick@Sun.COM * propagate all the way to the root via zio_notify_parent(). 
27337754SJeff.Bonwick@Sun.COM */ 27347754SJeff.Bonwick@Sun.COM ASSERT(vd == NULL && bp != NULL); 273510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 273610922SJeff.Bonwick@Sun.COM 273710922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(zio) && 273810922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 27397754SJeff.Bonwick@Sun.COM if (zio->io_error != ENOSPC) 27407754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_NOW; 27417754SJeff.Bonwick@Sun.COM else 27427754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 274310922SJeff.Bonwick@Sun.COM } 27447754SJeff.Bonwick@Sun.COM 27457754SJeff.Bonwick@Sun.COM if ((zio->io_type == ZIO_TYPE_READ || 27467754SJeff.Bonwick@Sun.COM zio->io_type == ZIO_TYPE_FREE) && 27477754SJeff.Bonwick@Sun.COM zio->io_error == ENXIO && 274811147SGeorge.Wilson@Sun.COM spa_load_state(spa) == SPA_LOAD_NONE && 27497754SJeff.Bonwick@Sun.COM spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 27507754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 27517754SJeff.Bonwick@Sun.COM 27527754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 27537754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 275410614SJonathan.Adams@Sun.COM 275510614SJonathan.Adams@Sun.COM /* 275610614SJonathan.Adams@Sun.COM * Here is a possibly good place to attempt to do 275710614SJonathan.Adams@Sun.COM * either combinatorial reconstruction or error correction 275810614SJonathan.Adams@Sun.COM * based on checksums. It also might be a good place 275910614SJonathan.Adams@Sun.COM * to send out preliminary ereports before we suspend 276010614SJonathan.Adams@Sun.COM * processing. 276110614SJonathan.Adams@Sun.COM */ 27627754SJeff.Bonwick@Sun.COM } 27637754SJeff.Bonwick@Sun.COM 27647754SJeff.Bonwick@Sun.COM /* 27657754SJeff.Bonwick@Sun.COM * If there were logical child errors, they apply to us now. 
27667754SJeff.Bonwick@Sun.COM * We defer this until now to avoid conflating logical child 27677754SJeff.Bonwick@Sun.COM * errors with errors that happened to the zio itself when 27687754SJeff.Bonwick@Sun.COM * updating vdev stats and reporting FMA events above. 27697754SJeff.Bonwick@Sun.COM */ 27707754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 27717754SJeff.Bonwick@Sun.COM 277210922SJeff.Bonwick@Sun.COM if ((zio->io_error || zio->io_reexecute) && 277310922SJeff.Bonwick@Sun.COM IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 277410922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 27759443SBill.Moore@Sun.COM zio_dva_unallocate(zio, zio->io_gang_tree, bp); 27769443SBill.Moore@Sun.COM 27779443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 27789443SBill.Moore@Sun.COM 27799470SGeorge.Wilson@Sun.COM /* 27809470SGeorge.Wilson@Sun.COM * Godfather I/Os should never suspend. 27819470SGeorge.Wilson@Sun.COM */ 27829470SGeorge.Wilson@Sun.COM if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 27839470SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 27849470SGeorge.Wilson@Sun.COM zio->io_reexecute = 0; 27859470SGeorge.Wilson@Sun.COM 27869470SGeorge.Wilson@Sun.COM if (zio->io_reexecute) { 27877754SJeff.Bonwick@Sun.COM /* 27887754SJeff.Bonwick@Sun.COM * This is a logical I/O that wants to reexecute. 27897754SJeff.Bonwick@Sun.COM * 27907754SJeff.Bonwick@Sun.COM * Reexecute is top-down. When an i/o fails, if it's not 27917754SJeff.Bonwick@Sun.COM * the root, it simply notifies its parent and sticks around. 27927754SJeff.Bonwick@Sun.COM * The parent, seeing that it still has children in zio_done(), 27937754SJeff.Bonwick@Sun.COM * does the same. This percolates all the way up to the root. 27947754SJeff.Bonwick@Sun.COM * The root i/o will reexecute or suspend the entire tree. 
27957754SJeff.Bonwick@Sun.COM * 27967754SJeff.Bonwick@Sun.COM * This approach ensures that zio_reexecute() honors 27977754SJeff.Bonwick@Sun.COM * all the original i/o dependency relationships, e.g. 27987754SJeff.Bonwick@Sun.COM * parents not executing until children are ready. 27997754SJeff.Bonwick@Sun.COM */ 28007754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 28017754SJeff.Bonwick@Sun.COM 28029443SBill.Moore@Sun.COM zio->io_gang_leader = NULL; 28037754SJeff.Bonwick@Sun.COM 28048632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 28058632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 28068632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 28078632SBill.Moore@Sun.COM 28089234SGeorge.Wilson@Sun.COM /* 28099234SGeorge.Wilson@Sun.COM * "The Godfather" I/O monitors its children but is 28109234SGeorge.Wilson@Sun.COM * not a true parent to them. It will track them through 28119234SGeorge.Wilson@Sun.COM * the pipeline but severs its ties whenever they get into 28129234SGeorge.Wilson@Sun.COM * trouble (e.g. suspended). This allows "The Godfather" 28139234SGeorge.Wilson@Sun.COM * I/O to return status without blocking. 
28149234SGeorge.Wilson@Sun.COM */ 28159234SGeorge.Wilson@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 28169234SGeorge.Wilson@Sun.COM zio_link_t *zl = zio->io_walk_link; 28179234SGeorge.Wilson@Sun.COM pio_next = zio_walk_parents(zio); 28189234SGeorge.Wilson@Sun.COM 28199234SGeorge.Wilson@Sun.COM if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 28209234SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 28219234SGeorge.Wilson@Sun.COM zio_remove_child(pio, zio, zl); 28229234SGeorge.Wilson@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28239234SGeorge.Wilson@Sun.COM } 28249234SGeorge.Wilson@Sun.COM } 28259234SGeorge.Wilson@Sun.COM 28268632SBill.Moore@Sun.COM if ((pio = zio_unique_parent(zio)) != NULL) { 28277754SJeff.Bonwick@Sun.COM /* 28287754SJeff.Bonwick@Sun.COM * We're not a root i/o, so there's nothing to do 28297754SJeff.Bonwick@Sun.COM * but notify our parent. Don't propagate errors 28307754SJeff.Bonwick@Sun.COM * upward since we haven't permanently failed yet. 28317754SJeff.Bonwick@Sun.COM */ 28329470SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 28337754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 28347754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28357754SJeff.Bonwick@Sun.COM } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 28367754SJeff.Bonwick@Sun.COM /* 28377754SJeff.Bonwick@Sun.COM * We'd fail again if we reexecuted now, so suspend 28387754SJeff.Bonwick@Sun.COM * until conditions improve (e.g. device comes online). 28397754SJeff.Bonwick@Sun.COM */ 28407754SJeff.Bonwick@Sun.COM zio_suspend(spa, zio); 28417754SJeff.Bonwick@Sun.COM } else { 28427754SJeff.Bonwick@Sun.COM /* 28437754SJeff.Bonwick@Sun.COM * Reexecution is potentially a huge amount of work. 28447754SJeff.Bonwick@Sun.COM * Hand it off to the otherwise-unused claim taskq. 
28457754SJeff.Bonwick@Sun.COM */ 28467754SJeff.Bonwick@Sun.COM (void) taskq_dispatch( 28477754SJeff.Bonwick@Sun.COM spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 28487754SJeff.Bonwick@Sun.COM (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 28497754SJeff.Bonwick@Sun.COM } 28507754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28517754SJeff.Bonwick@Sun.COM } 28527754SJeff.Bonwick@Sun.COM 285310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 28549470SGeorge.Wilson@Sun.COM ASSERT(zio->io_reexecute == 0); 28557754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 28567754SJeff.Bonwick@Sun.COM 285710922SJeff.Bonwick@Sun.COM /* 285810922SJeff.Bonwick@Sun.COM * Report any checksum errors, since the I/O is complete. 285910922SJeff.Bonwick@Sun.COM */ 286010614SJonathan.Adams@Sun.COM while (zio->io_cksum_report != NULL) { 286110922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 286210922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 286310922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 286410922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, NULL); 286510922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 286610614SJonathan.Adams@Sun.COM } 286710614SJonathan.Adams@Sun.COM 28688632SBill.Moore@Sun.COM /* 28698632SBill.Moore@Sun.COM * It is the responsibility of the done callback to ensure that this 28708632SBill.Moore@Sun.COM * particular zio is no longer discoverable for adoption, and as 28718632SBill.Moore@Sun.COM * such, cannot acquire any new parents. 
28728632SBill.Moore@Sun.COM */ 28737754SJeff.Bonwick@Sun.COM if (zio->io_done) 28747754SJeff.Bonwick@Sun.COM zio->io_done(zio); 28757754SJeff.Bonwick@Sun.COM 28768632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 28778632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 28788632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 28797754SJeff.Bonwick@Sun.COM 28808632SBill.Moore@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 28818632SBill.Moore@Sun.COM zio_link_t *zl = zio->io_walk_link; 28828632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 28838632SBill.Moore@Sun.COM zio_remove_child(pio, zio, zl); 28847754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28857754SJeff.Bonwick@Sun.COM } 28867754SJeff.Bonwick@Sun.COM 28877754SJeff.Bonwick@Sun.COM if (zio->io_waiter != NULL) { 28887754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 28897754SJeff.Bonwick@Sun.COM zio->io_executor = NULL; 28907754SJeff.Bonwick@Sun.COM cv_broadcast(&zio->io_cv); 28917754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 28927754SJeff.Bonwick@Sun.COM } else { 28937754SJeff.Bonwick@Sun.COM zio_destroy(zio); 28947754SJeff.Bonwick@Sun.COM } 28957754SJeff.Bonwick@Sun.COM 28967754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28977754SJeff.Bonwick@Sun.COM } 28987754SJeff.Bonwick@Sun.COM 28997754SJeff.Bonwick@Sun.COM /* 29007754SJeff.Bonwick@Sun.COM * ========================================================================== 29017754SJeff.Bonwick@Sun.COM * I/O pipeline definition 29027754SJeff.Bonwick@Sun.COM * ========================================================================== 29037754SJeff.Bonwick@Sun.COM */ 290410922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[] = { 29055530Sbonwick NULL, 290610922SJeff.Bonwick@Sun.COM zio_read_bp_init, 290710922SJeff.Bonwick@Sun.COM zio_free_bp_init, 29085530Sbonwick zio_issue_async, 29097754SJeff.Bonwick@Sun.COM zio_write_bp_init, 2910789Sahrens zio_checksum_generate, 
291110922SJeff.Bonwick@Sun.COM zio_ddt_read_start, 291210922SJeff.Bonwick@Sun.COM zio_ddt_read_done, 291310922SJeff.Bonwick@Sun.COM zio_ddt_write, 291410922SJeff.Bonwick@Sun.COM zio_ddt_free, 29157754SJeff.Bonwick@Sun.COM zio_gang_assemble, 29167754SJeff.Bonwick@Sun.COM zio_gang_issue, 2917789Sahrens zio_dva_allocate, 2918789Sahrens zio_dva_free, 2919789Sahrens zio_dva_claim, 2920789Sahrens zio_ready, 2921789Sahrens zio_vdev_io_start, 2922789Sahrens zio_vdev_io_done, 2923789Sahrens zio_vdev_io_assess, 2924789Sahrens zio_checksum_verify, 29257754SJeff.Bonwick@Sun.COM zio_done 2926789Sahrens }; 2927