/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
24789Sahrens */ 25789Sahrens 26789Sahrens #include <sys/zfs_context.h> 271544Seschrock #include <sys/fm/fs/zfs.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/txg.h> 30789Sahrens #include <sys/spa_impl.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio_impl.h> 33789Sahrens #include <sys/zio_compress.h> 34789Sahrens #include <sys/zio_checksum.h> 3510922SJeff.Bonwick@Sun.COM #include <sys/dmu_objset.h> 3610922SJeff.Bonwick@Sun.COM #include <sys/arc.h> 3710922SJeff.Bonwick@Sun.COM #include <sys/ddt.h> 38789Sahrens 39789Sahrens /* 40789Sahrens * ========================================================================== 41789Sahrens * I/O priority table 42789Sahrens * ========================================================================== 43789Sahrens */ 44789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 45789Sahrens 0, /* ZIO_PRIORITY_NOW */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 47789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 4811146SGeorge.Wilson@Sun.COM 0, /* ZIO_PRIORITY_LOG_WRITE */ 4911146SGeorge.Wilson@Sun.COM 1, /* ZIO_PRIORITY_CACHE_FILL */ 5011146SGeorge.Wilson@Sun.COM 1, /* ZIO_PRIORITY_AGG */ 51789Sahrens 4, /* ZIO_PRIORITY_FREE */ 5211146SGeorge.Wilson@Sun.COM 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 5311146SGeorge.Wilson@Sun.COM 6, /* ZIO_PRIORITY_ASYNC_READ */ 54789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 55789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 56789Sahrens }; 57789Sahrens 58789Sahrens /* 59789Sahrens * ========================================================================== 60789Sahrens * I/O type descriptions 61789Sahrens * ========================================================================== 62789Sahrens */ 63789Sahrens char *zio_type_name[ZIO_TYPES] = { 6411146SGeorge.Wilson@Sun.COM "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 6511146SGeorge.Wilson@Sun.COM "zio_ioctl" 6611146SGeorge.Wilson@Sun.COM }; 67789Sahrens 68789Sahrens /* 69789Sahrens * 
========================================================================== 70789Sahrens * I/O kmem caches 71789Sahrens * ========================================================================== 72789Sahrens */ 734055Seschrock kmem_cache_t *zio_cache; 748632SBill.Moore@Sun.COM kmem_cache_t *zio_link_cache; 75789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 763290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 773290Sjohansen 783290Sjohansen #ifdef _KERNEL 793290Sjohansen extern vmem_t *zio_alloc_arena; 803290Sjohansen #endif 81789Sahrens 825329Sgw25295 /* 837754SJeff.Bonwick@Sun.COM * An allocating zio is one that either currently has the DVA allocate 847754SJeff.Bonwick@Sun.COM * stage set or will have it later in its lifetime. 855329Sgw25295 */ 8610922SJeff.Bonwick@Sun.COM #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 8710922SJeff.Bonwick@Sun.COM 88*11173SJonathan.Adams@Sun.COM boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 89*11173SJonathan.Adams@Sun.COM 9010922SJeff.Bonwick@Sun.COM #ifdef ZFS_DEBUG 9110922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 16384; 9210922SJeff.Bonwick@Sun.COM #else 9310922SJeff.Bonwick@Sun.COM int zio_buf_debug_limit = 0; 9410922SJeff.Bonwick@Sun.COM #endif 955329Sgw25295 96789Sahrens void 97789Sahrens zio_init(void) 98789Sahrens { 99789Sahrens size_t c; 1003290Sjohansen vmem_t *data_alloc_arena = NULL; 1013290Sjohansen 1023290Sjohansen #ifdef _KERNEL 1033290Sjohansen data_alloc_arena = zio_alloc_arena; 1043290Sjohansen #endif 1058632SBill.Moore@Sun.COM zio_cache = kmem_cache_create("zio_cache", 1068632SBill.Moore@Sun.COM sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1078632SBill.Moore@Sun.COM zio_link_cache = kmem_cache_create("zio_link_cache", 1088632SBill.Moore@Sun.COM sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1094055Seschrock 110789Sahrens /* 111789Sahrens * For small buffers, we want a cache for each 
multiple of 112789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 113789Sahrens * for each quarter-power of 2. For large buffers, we want 114789Sahrens * a cache for each multiple of PAGESIZE. 115789Sahrens */ 116789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 117789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 118789Sahrens size_t p2 = size; 119789Sahrens size_t align = 0; 120789Sahrens 121789Sahrens while (p2 & (p2 - 1)) 122789Sahrens p2 &= p2 - 1; 123789Sahrens 124789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 125789Sahrens align = SPA_MINBLOCKSIZE; 126789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 127789Sahrens align = PAGESIZE; 128789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 129789Sahrens align = p2 >> 2; 130789Sahrens } 131789Sahrens 132789Sahrens if (align != 0) { 1333290Sjohansen char name[36]; 1342856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 135789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 13610922SJeff.Bonwick@Sun.COM align, NULL, NULL, NULL, NULL, NULL, 13710922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 1383290Sjohansen 1393290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1403290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1413290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 14210922SJeff.Bonwick@Sun.COM size > zio_buf_debug_limit ? 
KMC_NODEBUG : 0); 143789Sahrens } 144789Sahrens } 145789Sahrens 146789Sahrens while (--c != 0) { 147789Sahrens ASSERT(zio_buf_cache[c] != NULL); 148789Sahrens if (zio_buf_cache[c - 1] == NULL) 149789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1503290Sjohansen 1513290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1523290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1533290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 154789Sahrens } 1551544Seschrock 1561544Seschrock zio_inject_init(); 157789Sahrens } 158789Sahrens 159789Sahrens void 160789Sahrens zio_fini(void) 161789Sahrens { 162789Sahrens size_t c; 163789Sahrens kmem_cache_t *last_cache = NULL; 1643290Sjohansen kmem_cache_t *last_data_cache = NULL; 165789Sahrens 166789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 167789Sahrens if (zio_buf_cache[c] != last_cache) { 168789Sahrens last_cache = zio_buf_cache[c]; 169789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 170789Sahrens } 171789Sahrens zio_buf_cache[c] = NULL; 1723290Sjohansen 1733290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1743290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1753290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1763290Sjohansen } 1773290Sjohansen zio_data_buf_cache[c] = NULL; 178789Sahrens } 1791544Seschrock 1808632SBill.Moore@Sun.COM kmem_cache_destroy(zio_link_cache); 1814055Seschrock kmem_cache_destroy(zio_cache); 1824055Seschrock 1831544Seschrock zio_inject_fini(); 184789Sahrens } 185789Sahrens 186789Sahrens /* 187789Sahrens * ========================================================================== 188789Sahrens * Allocate and free I/O buffers 189789Sahrens * ========================================================================== 190789Sahrens */ 1913290Sjohansen 1923290Sjohansen /* 1933290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1943290Sjohansen * crashdump if the kernel panics, so use it judiciously. 
Obviously, it's 1953290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1963290Sjohansen * excess / transient data in-core during a crashdump. 1973290Sjohansen */ 198789Sahrens void * 199789Sahrens zio_buf_alloc(size_t size) 200789Sahrens { 201789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 202789Sahrens 203789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 204789Sahrens 2056245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 206789Sahrens } 207789Sahrens 2083290Sjohansen /* 2093290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2103290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2113290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2123290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2133290Sjohansen */ 2143290Sjohansen void * 2153290Sjohansen zio_data_buf_alloc(size_t size) 2163290Sjohansen { 2173290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2183290Sjohansen 2193290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2203290Sjohansen 2216245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2223290Sjohansen } 2233290Sjohansen 224789Sahrens void 225789Sahrens zio_buf_free(void *buf, size_t size) 226789Sahrens { 227789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 228789Sahrens 229789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 230789Sahrens 231789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 232789Sahrens } 233789Sahrens 2343290Sjohansen void 2353290Sjohansen zio_data_buf_free(void *buf, size_t size) 2363290Sjohansen { 2373290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2383290Sjohansen 2393290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2403290Sjohansen 2413290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2423290Sjohansen } 2433463Sahrens 244789Sahrens /* 
245789Sahrens * ========================================================================== 246789Sahrens * Push and pop I/O transform buffers 247789Sahrens * ========================================================================== 248789Sahrens */ 249789Sahrens static void 2507754SJeff.Bonwick@Sun.COM zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 2517754SJeff.Bonwick@Sun.COM zio_transform_func_t *transform) 252789Sahrens { 253789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 254789Sahrens 2557754SJeff.Bonwick@Sun.COM zt->zt_orig_data = zio->io_data; 2567754SJeff.Bonwick@Sun.COM zt->zt_orig_size = zio->io_size; 257789Sahrens zt->zt_bufsize = bufsize; 2587754SJeff.Bonwick@Sun.COM zt->zt_transform = transform; 259789Sahrens 260789Sahrens zt->zt_next = zio->io_transform_stack; 261789Sahrens zio->io_transform_stack = zt; 262789Sahrens 263789Sahrens zio->io_data = data; 264789Sahrens zio->io_size = size; 265789Sahrens } 266789Sahrens 267789Sahrens static void 2687754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio_t *zio) 269789Sahrens { 2707754SJeff.Bonwick@Sun.COM zio_transform_t *zt; 271789Sahrens 2727754SJeff.Bonwick@Sun.COM while ((zt = zio->io_transform_stack) != NULL) { 2737754SJeff.Bonwick@Sun.COM if (zt->zt_transform != NULL) 2747754SJeff.Bonwick@Sun.COM zt->zt_transform(zio, 2757754SJeff.Bonwick@Sun.COM zt->zt_orig_data, zt->zt_orig_size); 276789Sahrens 27710922SJeff.Bonwick@Sun.COM if (zt->zt_bufsize != 0) 27810922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zt->zt_bufsize); 279789Sahrens 2807754SJeff.Bonwick@Sun.COM zio->io_data = zt->zt_orig_data; 2817754SJeff.Bonwick@Sun.COM zio->io_size = zt->zt_orig_size; 2827754SJeff.Bonwick@Sun.COM zio->io_transform_stack = zt->zt_next; 283789Sahrens 2847754SJeff.Bonwick@Sun.COM kmem_free(zt, sizeof (zio_transform_t)); 285789Sahrens } 286789Sahrens } 287789Sahrens 288789Sahrens /* 289789Sahrens * 
========================================================================== 2907754SJeff.Bonwick@Sun.COM * I/O transform callbacks for subblocks and decompression 2917754SJeff.Bonwick@Sun.COM * ========================================================================== 2927754SJeff.Bonwick@Sun.COM */ 2937754SJeff.Bonwick@Sun.COM static void 2947754SJeff.Bonwick@Sun.COM zio_subblock(zio_t *zio, void *data, uint64_t size) 2957754SJeff.Bonwick@Sun.COM { 2967754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size > size); 2977754SJeff.Bonwick@Sun.COM 2987754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ) 2997754SJeff.Bonwick@Sun.COM bcopy(zio->io_data, data, size); 3007754SJeff.Bonwick@Sun.COM } 3017754SJeff.Bonwick@Sun.COM 3027754SJeff.Bonwick@Sun.COM static void 3037754SJeff.Bonwick@Sun.COM zio_decompress(zio_t *zio, void *data, uint64_t size) 3047754SJeff.Bonwick@Sun.COM { 3057754SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && 3067754SJeff.Bonwick@Sun.COM zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 30710922SJeff.Bonwick@Sun.COM zio->io_data, data, zio->io_size, size) != 0) 3087754SJeff.Bonwick@Sun.COM zio->io_error = EIO; 3097754SJeff.Bonwick@Sun.COM } 3107754SJeff.Bonwick@Sun.COM 3117754SJeff.Bonwick@Sun.COM /* 3127754SJeff.Bonwick@Sun.COM * ========================================================================== 3137754SJeff.Bonwick@Sun.COM * I/O parent/child relationships and pipeline interlocks 3147754SJeff.Bonwick@Sun.COM * ========================================================================== 3157754SJeff.Bonwick@Sun.COM */ 3168632SBill.Moore@Sun.COM /* 3178632SBill.Moore@Sun.COM * NOTE - Callers to zio_walk_parents() and zio_walk_children must 3188632SBill.Moore@Sun.COM * continue calling these functions until they return NULL. 3198632SBill.Moore@Sun.COM * Otherwise, the next caller will pick up the list walk in 3208632SBill.Moore@Sun.COM * some indeterminate state. 
(Otherwise every caller would 3218632SBill.Moore@Sun.COM * have to pass in a cookie to keep the state represented by 3228632SBill.Moore@Sun.COM * io_walk_link, which gets annoying.) 3238632SBill.Moore@Sun.COM */ 3248632SBill.Moore@Sun.COM zio_t * 3258632SBill.Moore@Sun.COM zio_walk_parents(zio_t *cio) 3268632SBill.Moore@Sun.COM { 3278632SBill.Moore@Sun.COM zio_link_t *zl = cio->io_walk_link; 3288632SBill.Moore@Sun.COM list_t *pl = &cio->io_parent_list; 3297754SJeff.Bonwick@Sun.COM 3308632SBill.Moore@Sun.COM zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 3318632SBill.Moore@Sun.COM cio->io_walk_link = zl; 3328632SBill.Moore@Sun.COM 3338632SBill.Moore@Sun.COM if (zl == NULL) 3348632SBill.Moore@Sun.COM return (NULL); 3358632SBill.Moore@Sun.COM 3368632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 3378632SBill.Moore@Sun.COM return (zl->zl_parent); 3388632SBill.Moore@Sun.COM } 3398632SBill.Moore@Sun.COM 3408632SBill.Moore@Sun.COM zio_t * 3418632SBill.Moore@Sun.COM zio_walk_children(zio_t *pio) 3427754SJeff.Bonwick@Sun.COM { 3438632SBill.Moore@Sun.COM zio_link_t *zl = pio->io_walk_link; 3448632SBill.Moore@Sun.COM list_t *cl = &pio->io_child_list; 3458632SBill.Moore@Sun.COM 3468632SBill.Moore@Sun.COM zl = (zl == NULL) ? 
list_head(cl) : list_next(cl, zl); 3478632SBill.Moore@Sun.COM pio->io_walk_link = zl; 3488632SBill.Moore@Sun.COM 3498632SBill.Moore@Sun.COM if (zl == NULL) 3508632SBill.Moore@Sun.COM return (NULL); 3518632SBill.Moore@Sun.COM 3528632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 3538632SBill.Moore@Sun.COM return (zl->zl_child); 3548632SBill.Moore@Sun.COM } 3558632SBill.Moore@Sun.COM 3568632SBill.Moore@Sun.COM zio_t * 3578632SBill.Moore@Sun.COM zio_unique_parent(zio_t *cio) 3588632SBill.Moore@Sun.COM { 3598632SBill.Moore@Sun.COM zio_t *pio = zio_walk_parents(cio); 3608632SBill.Moore@Sun.COM 3618632SBill.Moore@Sun.COM VERIFY(zio_walk_parents(cio) == NULL); 3628632SBill.Moore@Sun.COM return (pio); 3638632SBill.Moore@Sun.COM } 3648632SBill.Moore@Sun.COM 3658632SBill.Moore@Sun.COM void 3668632SBill.Moore@Sun.COM zio_add_child(zio_t *pio, zio_t *cio) 3678632SBill.Moore@Sun.COM { 3688632SBill.Moore@Sun.COM zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 3698632SBill.Moore@Sun.COM 3708632SBill.Moore@Sun.COM /* 3718632SBill.Moore@Sun.COM * Logical I/Os can have logical, gang, or vdev children. 3728632SBill.Moore@Sun.COM * Gang I/Os can have gang or vdev children. 3738632SBill.Moore@Sun.COM * Vdev I/Os can only have vdev children. 3748632SBill.Moore@Sun.COM * The following ASSERT captures all of these constraints. 
3758632SBill.Moore@Sun.COM */ 3768632SBill.Moore@Sun.COM ASSERT(cio->io_child_type <= pio->io_child_type); 3778632SBill.Moore@Sun.COM 3788632SBill.Moore@Sun.COM zl->zl_parent = pio; 3798632SBill.Moore@Sun.COM zl->zl_child = cio; 3808632SBill.Moore@Sun.COM 3818632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 3827754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 3838632SBill.Moore@Sun.COM 3848632SBill.Moore@Sun.COM ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 3858632SBill.Moore@Sun.COM 3868632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 3878632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 3888632SBill.Moore@Sun.COM 3898632SBill.Moore@Sun.COM list_insert_head(&pio->io_child_list, zl); 3908632SBill.Moore@Sun.COM list_insert_head(&cio->io_parent_list, zl); 3918632SBill.Moore@Sun.COM 39210922SJeff.Bonwick@Sun.COM pio->io_child_count++; 39310922SJeff.Bonwick@Sun.COM cio->io_parent_count++; 39410922SJeff.Bonwick@Sun.COM 3957754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 3968632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 3977754SJeff.Bonwick@Sun.COM } 3987754SJeff.Bonwick@Sun.COM 3997754SJeff.Bonwick@Sun.COM static void 4008632SBill.Moore@Sun.COM zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 4017754SJeff.Bonwick@Sun.COM { 4028632SBill.Moore@Sun.COM ASSERT(zl->zl_parent == pio); 4038632SBill.Moore@Sun.COM ASSERT(zl->zl_child == cio); 4047754SJeff.Bonwick@Sun.COM 4058632SBill.Moore@Sun.COM mutex_enter(&cio->io_lock); 4067754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4078632SBill.Moore@Sun.COM 4088632SBill.Moore@Sun.COM list_remove(&pio->io_child_list, zl); 4098632SBill.Moore@Sun.COM list_remove(&cio->io_parent_list, zl); 4108632SBill.Moore@Sun.COM 41110922SJeff.Bonwick@Sun.COM pio->io_child_count--; 41210922SJeff.Bonwick@Sun.COM cio->io_parent_count--; 41310922SJeff.Bonwick@Sun.COM 4147754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4158632SBill.Moore@Sun.COM mutex_exit(&cio->io_lock); 
4168632SBill.Moore@Sun.COM 4178632SBill.Moore@Sun.COM kmem_cache_free(zio_link_cache, zl); 4187754SJeff.Bonwick@Sun.COM } 4197754SJeff.Bonwick@Sun.COM 4207754SJeff.Bonwick@Sun.COM static boolean_t 4217754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 4227754SJeff.Bonwick@Sun.COM { 4237754SJeff.Bonwick@Sun.COM uint64_t *countp = &zio->io_children[child][wait]; 4247754SJeff.Bonwick@Sun.COM boolean_t waiting = B_FALSE; 4257754SJeff.Bonwick@Sun.COM 4267754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 4277754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 4287754SJeff.Bonwick@Sun.COM if (*countp != 0) { 42910922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 4307754SJeff.Bonwick@Sun.COM zio->io_stall = countp; 4317754SJeff.Bonwick@Sun.COM waiting = B_TRUE; 4327754SJeff.Bonwick@Sun.COM } 4337754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 4347754SJeff.Bonwick@Sun.COM 4357754SJeff.Bonwick@Sun.COM return (waiting); 4367754SJeff.Bonwick@Sun.COM } 4377754SJeff.Bonwick@Sun.COM 4387754SJeff.Bonwick@Sun.COM static void 4397754SJeff.Bonwick@Sun.COM zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 4407754SJeff.Bonwick@Sun.COM { 4417754SJeff.Bonwick@Sun.COM uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 4427754SJeff.Bonwick@Sun.COM int *errorp = &pio->io_child_error[zio->io_child_type]; 4437754SJeff.Bonwick@Sun.COM 4447754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 4457754SJeff.Bonwick@Sun.COM if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 4467754SJeff.Bonwick@Sun.COM *errorp = zio_worst_error(*errorp, zio->io_error); 4477754SJeff.Bonwick@Sun.COM pio->io_reexecute |= zio->io_reexecute; 4487754SJeff.Bonwick@Sun.COM ASSERT3U(*countp, >, 0); 4497754SJeff.Bonwick@Sun.COM if (--*countp == 0 && pio->io_stall == countp) { 4507754SJeff.Bonwick@Sun.COM pio->io_stall = NULL; 4517754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4527754SJeff.Bonwick@Sun.COM 
zio_execute(pio); 4537754SJeff.Bonwick@Sun.COM } else { 4547754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 4557754SJeff.Bonwick@Sun.COM } 4567754SJeff.Bonwick@Sun.COM } 4577754SJeff.Bonwick@Sun.COM 4587754SJeff.Bonwick@Sun.COM static void 4597754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio_t *zio, enum zio_child c) 4607754SJeff.Bonwick@Sun.COM { 4617754SJeff.Bonwick@Sun.COM if (zio->io_child_error[c] != 0 && zio->io_error == 0) 4627754SJeff.Bonwick@Sun.COM zio->io_error = zio->io_child_error[c]; 4637754SJeff.Bonwick@Sun.COM } 4647754SJeff.Bonwick@Sun.COM 4657754SJeff.Bonwick@Sun.COM /* 4667754SJeff.Bonwick@Sun.COM * ========================================================================== 4677754SJeff.Bonwick@Sun.COM * Create the various types of I/O (read, write, free, etc) 468789Sahrens * ========================================================================== 469789Sahrens */ 470789Sahrens static zio_t * 47110922SJeff.Bonwick@Sun.COM zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 472789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 47310922SJeff.Bonwick@Sun.COM zio_type_t type, int priority, enum zio_flag flags, 47410922SJeff.Bonwick@Sun.COM vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 47510922SJeff.Bonwick@Sun.COM enum zio_stage stage, enum zio_stage pipeline) 476789Sahrens { 477789Sahrens zio_t *zio; 478789Sahrens 479789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 480789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 4817754SJeff.Bonwick@Sun.COM ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 482789Sahrens 4837754SJeff.Bonwick@Sun.COM ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 4847754SJeff.Bonwick@Sun.COM ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 4857754SJeff.Bonwick@Sun.COM ASSERT(vd || stage == ZIO_STAGE_OPEN); 4867046Sahrens 4874055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 4884055Seschrock bzero(zio, sizeof (zio_t)); 
4897754SJeff.Bonwick@Sun.COM 4907754SJeff.Bonwick@Sun.COM mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 4917754SJeff.Bonwick@Sun.COM cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 4927754SJeff.Bonwick@Sun.COM 4938632SBill.Moore@Sun.COM list_create(&zio->io_parent_list, sizeof (zio_link_t), 4948632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_parent_node)); 4958632SBill.Moore@Sun.COM list_create(&zio->io_child_list, sizeof (zio_link_t), 4968632SBill.Moore@Sun.COM offsetof(zio_link_t, zl_child_node)); 4978632SBill.Moore@Sun.COM 4987754SJeff.Bonwick@Sun.COM if (vd != NULL) 4997754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_VDEV; 5007754SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_GANG_CHILD) 5017754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_GANG; 50210922SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_DDT_CHILD) 50310922SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_DDT; 5047754SJeff.Bonwick@Sun.COM else 5057754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_LOGICAL; 5067754SJeff.Bonwick@Sun.COM 507789Sahrens if (bp != NULL) { 50810922SJeff.Bonwick@Sun.COM zio->io_bp = (blkptr_t *)bp; 509789Sahrens zio->io_bp_copy = *bp; 510789Sahrens zio->io_bp_orig = *bp; 51110922SJeff.Bonwick@Sun.COM if (type != ZIO_TYPE_WRITE || 51210922SJeff.Bonwick@Sun.COM zio->io_child_type == ZIO_CHILD_DDT) 5137754SJeff.Bonwick@Sun.COM zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 5149443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) 5157754SJeff.Bonwick@Sun.COM zio->io_logical = zio; 5169443SBill.Moore@Sun.COM if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 5179443SBill.Moore@Sun.COM pipeline |= ZIO_GANG_STAGES; 518789Sahrens } 5197754SJeff.Bonwick@Sun.COM 5207754SJeff.Bonwick@Sun.COM zio->io_spa = spa; 5217754SJeff.Bonwick@Sun.COM zio->io_txg = txg; 522789Sahrens zio->io_done = done; 523789Sahrens zio->io_private = private; 524789Sahrens zio->io_type = type; 525789Sahrens zio->io_priority = priority; 
5267754SJeff.Bonwick@Sun.COM zio->io_vd = vd; 5277754SJeff.Bonwick@Sun.COM zio->io_offset = offset; 52810922SJeff.Bonwick@Sun.COM zio->io_orig_data = zio->io_data = data; 52910922SJeff.Bonwick@Sun.COM zio->io_orig_size = zio->io_size = size; 5307754SJeff.Bonwick@Sun.COM zio->io_orig_flags = zio->io_flags = flags; 5317754SJeff.Bonwick@Sun.COM zio->io_orig_stage = zio->io_stage = stage; 5327754SJeff.Bonwick@Sun.COM zio->io_orig_pipeline = zio->io_pipeline = pipeline; 5337754SJeff.Bonwick@Sun.COM 5348632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 5358632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 5368632SBill.Moore@Sun.COM 5377754SJeff.Bonwick@Sun.COM if (zb != NULL) 5387754SJeff.Bonwick@Sun.COM zio->io_bookmark = *zb; 539789Sahrens 5407754SJeff.Bonwick@Sun.COM if (pio != NULL) { 5417754SJeff.Bonwick@Sun.COM if (zio->io_logical == NULL) 5421544Seschrock zio->io_logical = pio->io_logical; 5439443SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_GANG) 5449443SBill.Moore@Sun.COM zio->io_gang_leader = pio->io_gang_leader; 5457754SJeff.Bonwick@Sun.COM zio_add_child(pio, zio); 546789Sahrens } 547789Sahrens 548789Sahrens return (zio); 549789Sahrens } 550789Sahrens 5515329Sgw25295 static void 5527754SJeff.Bonwick@Sun.COM zio_destroy(zio_t *zio) 5535329Sgw25295 { 5548632SBill.Moore@Sun.COM list_destroy(&zio->io_parent_list); 5558632SBill.Moore@Sun.COM list_destroy(&zio->io_child_list); 5567754SJeff.Bonwick@Sun.COM mutex_destroy(&zio->io_lock); 5577754SJeff.Bonwick@Sun.COM cv_destroy(&zio->io_cv); 5587754SJeff.Bonwick@Sun.COM kmem_cache_free(zio_cache, zio); 5595329Sgw25295 } 5605329Sgw25295 561789Sahrens zio_t * 5628632SBill.Moore@Sun.COM zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 56310922SJeff.Bonwick@Sun.COM void *private, enum zio_flag flags) 564789Sahrens { 565789Sahrens zio_t *zio; 566789Sahrens 567789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 
5688632SBill.Moore@Sun.COM ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 5697754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 570789Sahrens 571789Sahrens return (zio); 572789Sahrens } 573789Sahrens 574789Sahrens zio_t * 57510922SJeff.Bonwick@Sun.COM zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 576789Sahrens { 5778632SBill.Moore@Sun.COM return (zio_null(NULL, spa, NULL, done, private, flags)); 578789Sahrens } 579789Sahrens 580789Sahrens zio_t * 5817754SJeff.Bonwick@Sun.COM zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 5827754SJeff.Bonwick@Sun.COM void *data, uint64_t size, zio_done_func_t *done, void *private, 58310922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 584789Sahrens { 585789Sahrens zio_t *zio; 586789Sahrens 58710922SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 5887046Sahrens data, size, done, private, 5897754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 59010922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
59110922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 592789Sahrens 593789Sahrens return (zio); 594789Sahrens } 595789Sahrens 596789Sahrens zio_t * 5977754SJeff.Bonwick@Sun.COM zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 59810922SJeff.Bonwick@Sun.COM void *data, uint64_t size, const zio_prop_t *zp, 5997754SJeff.Bonwick@Sun.COM zio_done_func_t *ready, zio_done_func_t *done, void *private, 60010922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, const zbookmark_t *zb) 601789Sahrens { 602789Sahrens zio_t *zio; 603789Sahrens 6047754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 6057754SJeff.Bonwick@Sun.COM zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 6067754SJeff.Bonwick@Sun.COM zp->zp_compress >= ZIO_COMPRESS_OFF && 6077754SJeff.Bonwick@Sun.COM zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 6087754SJeff.Bonwick@Sun.COM zp->zp_type < DMU_OT_NUMTYPES && 6097754SJeff.Bonwick@Sun.COM zp->zp_level < 32 && 61010922SJeff.Bonwick@Sun.COM zp->zp_copies > 0 && 61110922SJeff.Bonwick@Sun.COM zp->zp_copies <= spa_max_replication(spa) && 61210922SJeff.Bonwick@Sun.COM zp->zp_dedup <= 1 && 61310922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify <= 1); 6145329Sgw25295 615789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6167754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 61710922SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
61810922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 619789Sahrens 6203547Smaybee zio->io_ready = ready; 6217754SJeff.Bonwick@Sun.COM zio->io_prop = *zp; 622789Sahrens 623789Sahrens return (zio); 624789Sahrens } 625789Sahrens 626789Sahrens zio_t * 6277754SJeff.Bonwick@Sun.COM zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 6287754SJeff.Bonwick@Sun.COM uint64_t size, zio_done_func_t *done, void *private, int priority, 62910922SJeff.Bonwick@Sun.COM enum zio_flag flags, zbookmark_t *zb) 630789Sahrens { 631789Sahrens zio_t *zio; 632789Sahrens 6337181Sperrin zio = zio_create(pio, spa, txg, bp, data, size, done, private, 6347754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 6357754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 636789Sahrens 637789Sahrens return (zio); 638789Sahrens } 639789Sahrens 64010922SJeff.Bonwick@Sun.COM void 64110922SJeff.Bonwick@Sun.COM zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 64210922SJeff.Bonwick@Sun.COM { 64310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 64410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 64510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 64610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 64710922SJeff.Bonwick@Sun.COM 64810922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies = copies; 64910922SJeff.Bonwick@Sun.COM zio->io_bp_override = bp; 65010922SJeff.Bonwick@Sun.COM } 65110922SJeff.Bonwick@Sun.COM 65210922SJeff.Bonwick@Sun.COM void 65310922SJeff.Bonwick@Sun.COM zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 65410922SJeff.Bonwick@Sun.COM { 65510922SJeff.Bonwick@Sun.COM bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); 65610922SJeff.Bonwick@Sun.COM } 65710922SJeff.Bonwick@Sun.COM 658789Sahrens zio_t * 65910922SJeff.Bonwick@Sun.COM zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 
66010922SJeff.Bonwick@Sun.COM enum zio_flag flags) 661789Sahrens { 662789Sahrens zio_t *zio; 663789Sahrens 664789Sahrens ASSERT(!BP_IS_HOLE(bp)); 66510922SJeff.Bonwick@Sun.COM ASSERT(spa_syncing_txg(spa) == txg); 66610922SJeff.Bonwick@Sun.COM ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 667789Sahrens 6687754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 66910922SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 6707754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 671789Sahrens 672789Sahrens return (zio); 673789Sahrens } 674789Sahrens 675789Sahrens zio_t * 67610922SJeff.Bonwick@Sun.COM zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 67710922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, enum zio_flag flags) 678789Sahrens { 679789Sahrens zio_t *zio; 680789Sahrens 681789Sahrens /* 682789Sahrens * A claim is an allocation of a specific block. Claims are needed 683789Sahrens * to support immediate writes in the intent log. The issue is that 684789Sahrens * immediate writes contain committed data, but in a txg that was 685789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 686789Sahrens * the intent log claims all blocks that contain immediate write data 687789Sahrens * so that the SPA knows they're in use. 688789Sahrens * 689789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 690789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 69110922SJeff.Bonwick@Sun.COM * If txg == 0 we just verify that the block is claimable. 
692789Sahrens */ 693789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 69410922SJeff.Bonwick@Sun.COM ASSERT(txg == spa_first_txg(spa) || txg == 0); 69510922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 696789Sahrens 6977754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 6987754SJeff.Bonwick@Sun.COM done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 6997754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 700789Sahrens 701789Sahrens return (zio); 702789Sahrens } 703789Sahrens 704789Sahrens zio_t * 705789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 70610922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 707789Sahrens { 708789Sahrens zio_t *zio; 709789Sahrens int c; 710789Sahrens 711789Sahrens if (vd->vdev_children == 0) { 712789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 7137754SJeff.Bonwick@Sun.COM ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 714789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 715789Sahrens 716789Sahrens zio->io_cmd = cmd; 717789Sahrens } else { 7188632SBill.Moore@Sun.COM zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 719789Sahrens 720789Sahrens for (c = 0; c < vd->vdev_children; c++) 721789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 722789Sahrens done, private, priority, flags)); 723789Sahrens } 724789Sahrens 725789Sahrens return (zio); 726789Sahrens } 727789Sahrens 728789Sahrens zio_t * 729789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 730789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 73110922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 732789Sahrens { 733789Sahrens zio_t *zio; 7345329Sgw25295 7357754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7367754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size 
<= VDEV_LABEL_START_SIZE || 7377754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7387754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 739789Sahrens 7407754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7417754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 742789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 743789Sahrens 7447754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 745789Sahrens 746789Sahrens return (zio); 747789Sahrens } 748789Sahrens 749789Sahrens zio_t * 750789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 751789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 75210922SJeff.Bonwick@Sun.COM int priority, enum zio_flag flags, boolean_t labels) 753789Sahrens { 754789Sahrens zio_t *zio; 755789Sahrens 7567754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 7577754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 7587754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 7597754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 7605329Sgw25295 7617754SJeff.Bonwick@Sun.COM zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 7627754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 763789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 764789Sahrens 7657754SJeff.Bonwick@Sun.COM zio->io_prop.zp_checksum = checksum; 766789Sahrens 767789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 768789Sahrens /* 769789Sahrens * zbt checksums are necessarily destructive -- they modify 7707754SJeff.Bonwick@Sun.COM * the end of the write buffer to hold the verifier/checksum. 771789Sahrens * Therefore, we must make a local copy in case the data is 7727754SJeff.Bonwick@Sun.COM * being written to multiple places in parallel. 
773789Sahrens */ 7747754SJeff.Bonwick@Sun.COM void *wbuf = zio_buf_alloc(size); 775789Sahrens bcopy(data, wbuf, size); 7767754SJeff.Bonwick@Sun.COM zio_push_transform(zio, wbuf, size, size, NULL); 777789Sahrens } 778789Sahrens 779789Sahrens return (zio); 780789Sahrens } 781789Sahrens 782789Sahrens /* 7837754SJeff.Bonwick@Sun.COM * Create a child I/O to do some work for us. 784789Sahrens */ 785789Sahrens zio_t * 7867754SJeff.Bonwick@Sun.COM zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 78710922SJeff.Bonwick@Sun.COM void *data, uint64_t size, int type, int priority, enum zio_flag flags, 788789Sahrens zio_done_func_t *done, void *private) 789789Sahrens { 79010922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 7917754SJeff.Bonwick@Sun.COM zio_t *zio; 7927754SJeff.Bonwick@Sun.COM 7937754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_parent == 7947754SJeff.Bonwick@Sun.COM (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 795789Sahrens 796789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 797789Sahrens /* 798789Sahrens * If we have the bp, then the child should perform the 799789Sahrens * checksum and the parent need not. This pushes error 800789Sahrens * detection as close to the leaves as possible and 801789Sahrens * eliminates redundant checksums in the interior nodes. 
802789Sahrens */ 80310922SJeff.Bonwick@Sun.COM pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 80410922SJeff.Bonwick@Sun.COM pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 8057754SJeff.Bonwick@Sun.COM } 8067754SJeff.Bonwick@Sun.COM 8077754SJeff.Bonwick@Sun.COM if (vd->vdev_children == 0) 8087754SJeff.Bonwick@Sun.COM offset += VDEV_LABEL_START_SIZE; 8097754SJeff.Bonwick@Sun.COM 81010922SJeff.Bonwick@Sun.COM flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 81110922SJeff.Bonwick@Sun.COM 81210922SJeff.Bonwick@Sun.COM /* 81310922SJeff.Bonwick@Sun.COM * If we've decided to do a repair, the write is not speculative -- 81410922SJeff.Bonwick@Sun.COM * even if the original read was. 81510922SJeff.Bonwick@Sun.COM */ 81610922SJeff.Bonwick@Sun.COM if (flags & ZIO_FLAG_IO_REPAIR) 81710922SJeff.Bonwick@Sun.COM flags &= ~ZIO_FLAG_SPECULATIVE; 81810922SJeff.Bonwick@Sun.COM 8197754SJeff.Bonwick@Sun.COM zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 82010922SJeff.Bonwick@Sun.COM done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 82110922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 8227754SJeff.Bonwick@Sun.COM 8237754SJeff.Bonwick@Sun.COM return (zio); 8247754SJeff.Bonwick@Sun.COM } 8257754SJeff.Bonwick@Sun.COM 8267754SJeff.Bonwick@Sun.COM zio_t * 8277754SJeff.Bonwick@Sun.COM zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 82810922SJeff.Bonwick@Sun.COM int type, int priority, enum zio_flag flags, 82910922SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private) 8307754SJeff.Bonwick@Sun.COM { 8317754SJeff.Bonwick@Sun.COM zio_t *zio; 8327754SJeff.Bonwick@Sun.COM 8337754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_ops->vdev_op_leaf); 8347754SJeff.Bonwick@Sun.COM 8357754SJeff.Bonwick@Sun.COM zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 8367754SJeff.Bonwick@Sun.COM data, size, done, private, type, priority, 8377754SJeff.Bonwick@Sun.COM flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 
8387754SJeff.Bonwick@Sun.COM vd, offset, NULL, 83910922SJeff.Bonwick@Sun.COM ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 8407754SJeff.Bonwick@Sun.COM 8417754SJeff.Bonwick@Sun.COM return (zio); 8427754SJeff.Bonwick@Sun.COM } 8437754SJeff.Bonwick@Sun.COM 8447754SJeff.Bonwick@Sun.COM void 8457754SJeff.Bonwick@Sun.COM zio_flush(zio_t *zio, vdev_t *vd) 8467754SJeff.Bonwick@Sun.COM { 8477754SJeff.Bonwick@Sun.COM zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 8487754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_PRIORITY_NOW, 8497754SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 8507754SJeff.Bonwick@Sun.COM } 8517754SJeff.Bonwick@Sun.COM 8527754SJeff.Bonwick@Sun.COM /* 8537754SJeff.Bonwick@Sun.COM * ========================================================================== 8547754SJeff.Bonwick@Sun.COM * Prepare to read and write logical blocks 8557754SJeff.Bonwick@Sun.COM * ========================================================================== 8567754SJeff.Bonwick@Sun.COM */ 8577754SJeff.Bonwick@Sun.COM 8587754SJeff.Bonwick@Sun.COM static int 8597754SJeff.Bonwick@Sun.COM zio_read_bp_init(zio_t *zio) 8607754SJeff.Bonwick@Sun.COM { 8617754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 8627754SJeff.Bonwick@Sun.COM 8638274SJeff.Bonwick@Sun.COM if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 8649443SBill.Moore@Sun.COM zio->io_child_type == ZIO_CHILD_LOGICAL && 8659443SBill.Moore@Sun.COM !(zio->io_flags & ZIO_FLAG_RAW)) { 86610922SJeff.Bonwick@Sun.COM uint64_t psize = BP_GET_PSIZE(bp); 86710922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(psize); 86810922SJeff.Bonwick@Sun.COM 86910922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 8707754SJeff.Bonwick@Sun.COM } 8717754SJeff.Bonwick@Sun.COM 8727754SJeff.Bonwick@Sun.COM if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 8737754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 
8747754SJeff.Bonwick@Sun.COM 87511125SJeff.Bonwick@Sun.COM if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 87611125SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_CACHE; 87711125SJeff.Bonwick@Sun.COM 87810922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 87910922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 88010922SJeff.Bonwick@Sun.COM 8817754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 8827754SJeff.Bonwick@Sun.COM } 8837754SJeff.Bonwick@Sun.COM 8847754SJeff.Bonwick@Sun.COM static int 8857754SJeff.Bonwick@Sun.COM zio_write_bp_init(zio_t *zio) 8867754SJeff.Bonwick@Sun.COM { 88710922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 8887754SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 88910922SJeff.Bonwick@Sun.COM enum zio_compress compress = zp->zp_compress; 8907754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 8917754SJeff.Bonwick@Sun.COM uint64_t lsize = zio->io_size; 89210922SJeff.Bonwick@Sun.COM uint64_t psize = lsize; 8937754SJeff.Bonwick@Sun.COM int pass = 1; 8947754SJeff.Bonwick@Sun.COM 8957754SJeff.Bonwick@Sun.COM /* 8967754SJeff.Bonwick@Sun.COM * If our children haven't all reached the ready stage, 8977754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 
8987754SJeff.Bonwick@Sun.COM */ 8997754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 9007754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 9017754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 9027754SJeff.Bonwick@Sun.COM 9037754SJeff.Bonwick@Sun.COM if (!IO_IS_ALLOCATING(zio)) 9047754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 9057754SJeff.Bonwick@Sun.COM 90610922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 90710922SJeff.Bonwick@Sun.COM 90810922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 90910922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth != zio->io_txg); 91010922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 91110922SJeff.Bonwick@Sun.COM 91210922SJeff.Bonwick@Sun.COM *bp = *zio->io_bp_override; 91310922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 91410922SJeff.Bonwick@Sun.COM 91510922SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(bp) || !zp->zp_dedup) 91610922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 91710922SJeff.Bonwick@Sun.COM 91810922SJeff.Bonwick@Sun.COM ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 91910922SJeff.Bonwick@Sun.COM zp->zp_dedup_verify); 92010922SJeff.Bonwick@Sun.COM 92110922SJeff.Bonwick@Sun.COM if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 92210922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, 1); 92310922SJeff.Bonwick@Sun.COM zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 92410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 92510922SJeff.Bonwick@Sun.COM } 92610922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 92710922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 92810922SJeff.Bonwick@Sun.COM } 9297754SJeff.Bonwick@Sun.COM 9307754SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg) { 9317754SJeff.Bonwick@Sun.COM /* 9327754SJeff.Bonwick@Sun.COM * We're rewriting an existing block, which means we're 9337754SJeff.Bonwick@Sun.COM * working on behalf of spa_sync(). 
For spa_sync() to 9347754SJeff.Bonwick@Sun.COM * converge, it must eventually be the case that we don't 9357754SJeff.Bonwick@Sun.COM * have to allocate new blocks. But compression changes 9367754SJeff.Bonwick@Sun.COM * the blocksize, which forces a reallocate, and makes 9377754SJeff.Bonwick@Sun.COM * convergence take longer. Therefore, after the first 9387754SJeff.Bonwick@Sun.COM * few passes, stop compressing to ensure convergence. 9397754SJeff.Bonwick@Sun.COM */ 94010922SJeff.Bonwick@Sun.COM pass = spa_sync_pass(spa); 94110922SJeff.Bonwick@Sun.COM 94210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_txg == spa_syncing_txg(spa)); 94310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 94410922SJeff.Bonwick@Sun.COM ASSERT(!BP_GET_DEDUP(bp)); 9457754SJeff.Bonwick@Sun.COM 9467754SJeff.Bonwick@Sun.COM if (pass > SYNC_PASS_DONT_COMPRESS) 9477754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 9487754SJeff.Bonwick@Sun.COM 9497754SJeff.Bonwick@Sun.COM /* Make sure someone doesn't change their mind on overwrites */ 95010922SJeff.Bonwick@Sun.COM ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 95110922SJeff.Bonwick@Sun.COM spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 9527754SJeff.Bonwick@Sun.COM } 9537754SJeff.Bonwick@Sun.COM 9547754SJeff.Bonwick@Sun.COM if (compress != ZIO_COMPRESS_OFF) { 95510922SJeff.Bonwick@Sun.COM void *cbuf = zio_buf_alloc(lsize); 95610922SJeff.Bonwick@Sun.COM psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 95710922SJeff.Bonwick@Sun.COM if (psize == 0 || psize == lsize) { 9587754SJeff.Bonwick@Sun.COM compress = ZIO_COMPRESS_OFF; 95910922SJeff.Bonwick@Sun.COM zio_buf_free(cbuf, lsize); 96010922SJeff.Bonwick@Sun.COM } else { 96110922SJeff.Bonwick@Sun.COM ASSERT(psize < lsize); 96210922SJeff.Bonwick@Sun.COM zio_push_transform(zio, cbuf, psize, lsize, NULL); 9637754SJeff.Bonwick@Sun.COM } 964789Sahrens } 965789Sahrens 9667754SJeff.Bonwick@Sun.COM /* 9677754SJeff.Bonwick@Sun.COM * The final pass of spa_sync() must be all 
rewrites, but the first 9687754SJeff.Bonwick@Sun.COM * few passes offer a trade-off: allocating blocks defers convergence, 9697754SJeff.Bonwick@Sun.COM * but newly allocated blocks are sequential, so they can be written 9707754SJeff.Bonwick@Sun.COM * to disk faster. Therefore, we allow the first few passes of 9717754SJeff.Bonwick@Sun.COM * spa_sync() to allocate new blocks, but force rewrites after that. 9727754SJeff.Bonwick@Sun.COM * There should only be a handful of blocks after pass 1 in any case. 9737754SJeff.Bonwick@Sun.COM */ 97410922SJeff.Bonwick@Sun.COM if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 9757754SJeff.Bonwick@Sun.COM pass > SYNC_PASS_REWRITE) { 97610922SJeff.Bonwick@Sun.COM ASSERT(psize != 0); 97710922SJeff.Bonwick@Sun.COM enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 9787754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 9797754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_REWRITE; 9807754SJeff.Bonwick@Sun.COM } else { 9817754SJeff.Bonwick@Sun.COM BP_ZERO(bp); 9827754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 9837754SJeff.Bonwick@Sun.COM } 9847754SJeff.Bonwick@Sun.COM 98510922SJeff.Bonwick@Sun.COM if (psize == 0) { 9867754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 9877754SJeff.Bonwick@Sun.COM } else { 9887754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 9897754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(bp, lsize); 99010922SJeff.Bonwick@Sun.COM BP_SET_PSIZE(bp, psize); 9917754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(bp, compress); 9927754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(bp, zp->zp_checksum); 9937754SJeff.Bonwick@Sun.COM BP_SET_TYPE(bp, zp->zp_type); 9947754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(bp, zp->zp_level); 99510922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(bp, zp->zp_dedup); 9967754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 99710922SJeff.Bonwick@Sun.COM if (zp->zp_dedup) { 
99810922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 99910922SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 100010922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 100110922SJeff.Bonwick@Sun.COM } 100210922SJeff.Bonwick@Sun.COM } 100310922SJeff.Bonwick@Sun.COM 100410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 100510922SJeff.Bonwick@Sun.COM } 100610922SJeff.Bonwick@Sun.COM 100710922SJeff.Bonwick@Sun.COM static int 100810922SJeff.Bonwick@Sun.COM zio_free_bp_init(zio_t *zio) 100910922SJeff.Bonwick@Sun.COM { 101010922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 101110922SJeff.Bonwick@Sun.COM 101210922SJeff.Bonwick@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 101310922SJeff.Bonwick@Sun.COM if (BP_GET_DEDUP(bp)) 101410922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 101510922SJeff.Bonwick@Sun.COM else 101610922SJeff.Bonwick@Sun.COM arc_free(zio->io_spa, bp); 10177754SJeff.Bonwick@Sun.COM } 10187754SJeff.Bonwick@Sun.COM 10197754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 10207754SJeff.Bonwick@Sun.COM } 10217754SJeff.Bonwick@Sun.COM 10227754SJeff.Bonwick@Sun.COM /* 10237754SJeff.Bonwick@Sun.COM * ========================================================================== 10247754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline 10257754SJeff.Bonwick@Sun.COM * ========================================================================== 10267754SJeff.Bonwick@Sun.COM */ 10277754SJeff.Bonwick@Sun.COM 10287754SJeff.Bonwick@Sun.COM static void 1029*11173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) 10307754SJeff.Bonwick@Sun.COM { 103111146SGeorge.Wilson@Sun.COM spa_t *spa = zio->io_spa; 10327754SJeff.Bonwick@Sun.COM zio_type_t t = zio->io_type; 1033*11173SJonathan.Adams@Sun.COM int flags = TQ_SLEEP | (cutinline ? 
TQ_FRONT : 0); 10347754SJeff.Bonwick@Sun.COM 10357754SJeff.Bonwick@Sun.COM /* 10369722SGeorge.Wilson@Sun.COM * If we're a config writer or a probe, the normal issue and 10379722SGeorge.Wilson@Sun.COM * interrupt threads may all be blocked waiting for the config lock. 10389722SGeorge.Wilson@Sun.COM * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 10397754SJeff.Bonwick@Sun.COM */ 10409722SGeorge.Wilson@Sun.COM if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 10417754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10427754SJeff.Bonwick@Sun.COM 10437754SJeff.Bonwick@Sun.COM /* 10447754SJeff.Bonwick@Sun.COM * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 10457754SJeff.Bonwick@Sun.COM */ 10467754SJeff.Bonwick@Sun.COM if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 10477754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 10487754SJeff.Bonwick@Sun.COM 104911146SGeorge.Wilson@Sun.COM /* 105011146SGeorge.Wilson@Sun.COM * If this is a high priority I/O, then use the high priority taskq. 
105111146SGeorge.Wilson@Sun.COM */ 105211146SGeorge.Wilson@Sun.COM if (zio->io_priority == ZIO_PRIORITY_NOW && 105311146SGeorge.Wilson@Sun.COM spa->spa_zio_taskq[t][q + 1] != NULL) 105411146SGeorge.Wilson@Sun.COM q++; 105511146SGeorge.Wilson@Sun.COM 105611146SGeorge.Wilson@Sun.COM ASSERT3U(q, <, ZIO_TASKQ_TYPES); 105711146SGeorge.Wilson@Sun.COM (void) taskq_dispatch(spa->spa_zio_taskq[t][q], 1058*11173SJonathan.Adams@Sun.COM (task_func_t *)zio_execute, zio, flags); 10597754SJeff.Bonwick@Sun.COM } 10607754SJeff.Bonwick@Sun.COM 10617754SJeff.Bonwick@Sun.COM static boolean_t 10627754SJeff.Bonwick@Sun.COM zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 10637754SJeff.Bonwick@Sun.COM { 10647754SJeff.Bonwick@Sun.COM kthread_t *executor = zio->io_executor; 10657754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 1066789Sahrens 10677754SJeff.Bonwick@Sun.COM for (zio_type_t t = 0; t < ZIO_TYPES; t++) 10687754SJeff.Bonwick@Sun.COM if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 10697754SJeff.Bonwick@Sun.COM return (B_TRUE); 10707754SJeff.Bonwick@Sun.COM 10717754SJeff.Bonwick@Sun.COM return (B_FALSE); 10727754SJeff.Bonwick@Sun.COM } 10737754SJeff.Bonwick@Sun.COM 10747754SJeff.Bonwick@Sun.COM static int 10757754SJeff.Bonwick@Sun.COM zio_issue_async(zio_t *zio) 10767754SJeff.Bonwick@Sun.COM { 1077*11173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 10787754SJeff.Bonwick@Sun.COM 10797754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 10807754SJeff.Bonwick@Sun.COM } 10817754SJeff.Bonwick@Sun.COM 10827754SJeff.Bonwick@Sun.COM void 10837754SJeff.Bonwick@Sun.COM zio_interrupt(zio_t *zio) 10847754SJeff.Bonwick@Sun.COM { 1085*11173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 10867754SJeff.Bonwick@Sun.COM } 10877754SJeff.Bonwick@Sun.COM 10887754SJeff.Bonwick@Sun.COM /* 10897754SJeff.Bonwick@Sun.COM * Execute the I/O pipeline until one of the following occurs: 10907754SJeff.Bonwick@Sun.COM * (1) the I/O completes; 
(2) the pipeline stalls waiting for 10917754SJeff.Bonwick@Sun.COM * dependent child I/Os; (3) the I/O issues, so we're waiting 10927754SJeff.Bonwick@Sun.COM * for an I/O completion interrupt; (4) the I/O is delegated by 10937754SJeff.Bonwick@Sun.COM * vdev-level caching or aggregation; (5) the I/O is deferred 10947754SJeff.Bonwick@Sun.COM * due to vdev-level queueing; (6) the I/O is handed off to 10957754SJeff.Bonwick@Sun.COM * another thread. In all cases, the pipeline stops whenever 10967754SJeff.Bonwick@Sun.COM * there's no CPU work; it never burns a thread in cv_wait(). 10977754SJeff.Bonwick@Sun.COM * 10987754SJeff.Bonwick@Sun.COM * There's no locking on io_stage because there's no legitimate way 10997754SJeff.Bonwick@Sun.COM * for multiple threads to be attempting to process the same I/O. 11007754SJeff.Bonwick@Sun.COM */ 110110922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[]; 1102789Sahrens 11037754SJeff.Bonwick@Sun.COM void 11047754SJeff.Bonwick@Sun.COM zio_execute(zio_t *zio) 11057754SJeff.Bonwick@Sun.COM { 11067754SJeff.Bonwick@Sun.COM zio->io_executor = curthread; 11077754SJeff.Bonwick@Sun.COM 11087754SJeff.Bonwick@Sun.COM while (zio->io_stage < ZIO_STAGE_DONE) { 110910922SJeff.Bonwick@Sun.COM enum zio_stage pipeline = zio->io_pipeline; 111010922SJeff.Bonwick@Sun.COM enum zio_stage stage = zio->io_stage; 11117754SJeff.Bonwick@Sun.COM int rv; 11127754SJeff.Bonwick@Sun.COM 11137754SJeff.Bonwick@Sun.COM ASSERT(!MUTEX_HELD(&zio->io_lock)); 111410922SJeff.Bonwick@Sun.COM ASSERT(ISP2(stage)); 111510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 111610922SJeff.Bonwick@Sun.COM 111710922SJeff.Bonwick@Sun.COM do { 111810922SJeff.Bonwick@Sun.COM stage <<= 1; 111910922SJeff.Bonwick@Sun.COM } while ((stage & pipeline) == 0); 11207754SJeff.Bonwick@Sun.COM 11217754SJeff.Bonwick@Sun.COM ASSERT(stage <= ZIO_STAGE_DONE); 11227754SJeff.Bonwick@Sun.COM 11237754SJeff.Bonwick@Sun.COM /* 11247754SJeff.Bonwick@Sun.COM * If we are in interrupt context and 
this pipeline stage 11257754SJeff.Bonwick@Sun.COM * will grab a config lock that is held across I/O, 112610922SJeff.Bonwick@Sun.COM * or may wait for an I/O that needs an interrupt thread 112710922SJeff.Bonwick@Sun.COM * to complete, issue async to avoid deadlock. 1128*11173SJonathan.Adams@Sun.COM * 1129*11173SJonathan.Adams@Sun.COM * For VDEV_IO_START, we cut in line so that the io will 1130*11173SJonathan.Adams@Sun.COM * be sent to disk promptly. 11317754SJeff.Bonwick@Sun.COM */ 113210922SJeff.Bonwick@Sun.COM if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 11337754SJeff.Bonwick@Sun.COM zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1134*11173SJonathan.Adams@Sun.COM boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1135*11173SJonathan.Adams@Sun.COM zio_requeue_io_start_cut_in_line : B_FALSE; 1136*11173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 11377754SJeff.Bonwick@Sun.COM return; 11387754SJeff.Bonwick@Sun.COM } 11397754SJeff.Bonwick@Sun.COM 11407754SJeff.Bonwick@Sun.COM zio->io_stage = stage; 114110922SJeff.Bonwick@Sun.COM rv = zio_pipeline[highbit(stage) - 1](zio); 11427754SJeff.Bonwick@Sun.COM 11437754SJeff.Bonwick@Sun.COM if (rv == ZIO_PIPELINE_STOP) 11447754SJeff.Bonwick@Sun.COM return; 11457754SJeff.Bonwick@Sun.COM 11467754SJeff.Bonwick@Sun.COM ASSERT(rv == ZIO_PIPELINE_CONTINUE); 11477754SJeff.Bonwick@Sun.COM } 1148789Sahrens } 1149789Sahrens 1150789Sahrens /* 1151789Sahrens * ========================================================================== 1152789Sahrens * Initiate I/O, either sync or async 1153789Sahrens * ========================================================================== 1154789Sahrens */ 1155789Sahrens int 1156789Sahrens zio_wait(zio_t *zio) 1157789Sahrens { 1158789Sahrens int error; 1159789Sahrens 1160789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 11617754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 1162789Sahrens 1163789Sahrens zio->io_waiter = curthread; 1164789Sahrens 
11655530Sbonwick zio_execute(zio); 1166789Sahrens 1167789Sahrens mutex_enter(&zio->io_lock); 11687754SJeff.Bonwick@Sun.COM while (zio->io_executor != NULL) 1169789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 1170789Sahrens mutex_exit(&zio->io_lock); 1171789Sahrens 1172789Sahrens error = zio->io_error; 11736523Sek110237 zio_destroy(zio); 1174789Sahrens 1175789Sahrens return (error); 1176789Sahrens } 1177789Sahrens 1178789Sahrens void 1179789Sahrens zio_nowait(zio_t *zio) 1180789Sahrens { 11817754SJeff.Bonwick@Sun.COM ASSERT(zio->io_executor == NULL); 11827754SJeff.Bonwick@Sun.COM 11838632SBill.Moore@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL && 11848632SBill.Moore@Sun.COM zio_unique_parent(zio) == NULL) { 11857754SJeff.Bonwick@Sun.COM /* 11867754SJeff.Bonwick@Sun.COM * This is a logical async I/O with no parent to wait for it. 11879234SGeorge.Wilson@Sun.COM * We add it to the spa_async_root_zio "Godfather" I/O which 11889234SGeorge.Wilson@Sun.COM * will ensure they complete prior to unloading the pool. 
11897754SJeff.Bonwick@Sun.COM */ 11907754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 11919234SGeorge.Wilson@Sun.COM 11929234SGeorge.Wilson@Sun.COM zio_add_child(spa->spa_async_zio_root, zio); 11937754SJeff.Bonwick@Sun.COM } 11947754SJeff.Bonwick@Sun.COM 11955530Sbonwick zio_execute(zio); 11965530Sbonwick } 11975530Sbonwick 11987754SJeff.Bonwick@Sun.COM /* 11997754SJeff.Bonwick@Sun.COM * ========================================================================== 12007754SJeff.Bonwick@Sun.COM * Reexecute or suspend/resume failed I/O 12017754SJeff.Bonwick@Sun.COM * ========================================================================== 12027754SJeff.Bonwick@Sun.COM */ 12037754SJeff.Bonwick@Sun.COM 12047754SJeff.Bonwick@Sun.COM static void 12057754SJeff.Bonwick@Sun.COM zio_reexecute(zio_t *pio) 12067754SJeff.Bonwick@Sun.COM { 12078632SBill.Moore@Sun.COM zio_t *cio, *cio_next; 12088632SBill.Moore@Sun.COM 12098632SBill.Moore@Sun.COM ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 12108632SBill.Moore@Sun.COM ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 12119443SBill.Moore@Sun.COM ASSERT(pio->io_gang_leader == NULL); 12129443SBill.Moore@Sun.COM ASSERT(pio->io_gang_tree == NULL); 12137754SJeff.Bonwick@Sun.COM 12147754SJeff.Bonwick@Sun.COM pio->io_flags = pio->io_orig_flags; 12157754SJeff.Bonwick@Sun.COM pio->io_stage = pio->io_orig_stage; 12167754SJeff.Bonwick@Sun.COM pio->io_pipeline = pio->io_orig_pipeline; 12177754SJeff.Bonwick@Sun.COM pio->io_reexecute = 0; 12187754SJeff.Bonwick@Sun.COM pio->io_error = 0; 12198632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12208632SBill.Moore@Sun.COM pio->io_state[w] = 0; 12217754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 12227754SJeff.Bonwick@Sun.COM pio->io_child_error[c] = 0; 12237754SJeff.Bonwick@Sun.COM 122410922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(pio)) 122510922SJeff.Bonwick@Sun.COM BP_ZERO(pio->io_bp); 12267754SJeff.Bonwick@Sun.COM 12277754SJeff.Bonwick@Sun.COM /* 
12287754SJeff.Bonwick@Sun.COM * As we reexecute pio's children, new children could be created. 12298632SBill.Moore@Sun.COM * New children go to the head of pio's io_child_list, however, 12307754SJeff.Bonwick@Sun.COM * so we will (correctly) not reexecute them. The key is that 12318632SBill.Moore@Sun.COM * the remainder of pio's io_child_list, from 'cio_next' onward, 12328632SBill.Moore@Sun.COM * cannot be affected by any side effects of reexecuting 'cio'. 12337754SJeff.Bonwick@Sun.COM */ 12348632SBill.Moore@Sun.COM for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 12358632SBill.Moore@Sun.COM cio_next = zio_walk_children(pio); 12367754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 12378632SBill.Moore@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 12388632SBill.Moore@Sun.COM pio->io_children[cio->io_child_type][w]++; 12397754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 12408632SBill.Moore@Sun.COM zio_reexecute(cio); 12417754SJeff.Bonwick@Sun.COM } 12427754SJeff.Bonwick@Sun.COM 12437754SJeff.Bonwick@Sun.COM /* 12447754SJeff.Bonwick@Sun.COM * Now that all children have been reexecuted, execute the parent. 12459234SGeorge.Wilson@Sun.COM * We don't reexecute "The Godfather" I/O here as it's the 12469234SGeorge.Wilson@Sun.COM * responsibility of the caller to wait on him. 
12477754SJeff.Bonwick@Sun.COM */ 12489234SGeorge.Wilson@Sun.COM if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 12499234SGeorge.Wilson@Sun.COM zio_execute(pio); 12507754SJeff.Bonwick@Sun.COM } 12517754SJeff.Bonwick@Sun.COM 12525530Sbonwick void 12537754SJeff.Bonwick@Sun.COM zio_suspend(spa_t *spa, zio_t *zio) 12545530Sbonwick { 12557754SJeff.Bonwick@Sun.COM if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 12567754SJeff.Bonwick@Sun.COM fm_panic("Pool '%s' has encountered an uncorrectable I/O " 12577754SJeff.Bonwick@Sun.COM "failure and the failure mode property for this pool " 12587754SJeff.Bonwick@Sun.COM "is set to panic.", spa_name(spa)); 12597754SJeff.Bonwick@Sun.COM 12607754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 12617754SJeff.Bonwick@Sun.COM 12627754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12637754SJeff.Bonwick@Sun.COM 12647754SJeff.Bonwick@Sun.COM if (spa->spa_suspend_zio_root == NULL) 12659234SGeorge.Wilson@Sun.COM spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 12669234SGeorge.Wilson@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 12679234SGeorge.Wilson@Sun.COM ZIO_FLAG_GODFATHER); 12687754SJeff.Bonwick@Sun.COM 12697754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_TRUE; 12707754SJeff.Bonwick@Sun.COM 12717754SJeff.Bonwick@Sun.COM if (zio != NULL) { 12729234SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 12737754SJeff.Bonwick@Sun.COM ASSERT(zio != spa->spa_suspend_zio_root); 12747754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 12758632SBill.Moore@Sun.COM ASSERT(zio_unique_parent(zio) == NULL); 12767754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stage == ZIO_STAGE_DONE); 12777754SJeff.Bonwick@Sun.COM zio_add_child(spa->spa_suspend_zio_root, zio); 12787754SJeff.Bonwick@Sun.COM } 12797754SJeff.Bonwick@Sun.COM 12807754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12815530Sbonwick } 12825530Sbonwick 12839234SGeorge.Wilson@Sun.COM int 
12847754SJeff.Bonwick@Sun.COM zio_resume(spa_t *spa) 12855530Sbonwick { 12869234SGeorge.Wilson@Sun.COM zio_t *pio; 12877754SJeff.Bonwick@Sun.COM 12887754SJeff.Bonwick@Sun.COM /* 12897754SJeff.Bonwick@Sun.COM * Reexecute all previously suspended i/o. 12907754SJeff.Bonwick@Sun.COM */ 12917754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12927754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_FALSE; 12937754SJeff.Bonwick@Sun.COM cv_broadcast(&spa->spa_suspend_cv); 12947754SJeff.Bonwick@Sun.COM pio = spa->spa_suspend_zio_root; 12957754SJeff.Bonwick@Sun.COM spa->spa_suspend_zio_root = NULL; 12967754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12977754SJeff.Bonwick@Sun.COM 12987754SJeff.Bonwick@Sun.COM if (pio == NULL) 12999234SGeorge.Wilson@Sun.COM return (0); 13005530Sbonwick 13019234SGeorge.Wilson@Sun.COM zio_reexecute(pio); 13029234SGeorge.Wilson@Sun.COM return (zio_wait(pio)); 13037754SJeff.Bonwick@Sun.COM } 13047754SJeff.Bonwick@Sun.COM 13057754SJeff.Bonwick@Sun.COM void 13067754SJeff.Bonwick@Sun.COM zio_resume_wait(spa_t *spa) 13077754SJeff.Bonwick@Sun.COM { 13087754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 13097754SJeff.Bonwick@Sun.COM while (spa_suspended(spa)) 13107754SJeff.Bonwick@Sun.COM cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 13117754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 1312789Sahrens } 1313789Sahrens 1314789Sahrens /* 1315789Sahrens * ========================================================================== 13167754SJeff.Bonwick@Sun.COM * Gang blocks. 13177754SJeff.Bonwick@Sun.COM * 13187754SJeff.Bonwick@Sun.COM * A gang block is a collection of small blocks that looks to the DMU 13197754SJeff.Bonwick@Sun.COM * like one large block. 
 * When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
1377789Sahrens * ========================================================================== 1378789Sahrens */ 13795530Sbonwick 13807754SJeff.Bonwick@Sun.COM static zio_t * 13817754SJeff.Bonwick@Sun.COM zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 13827754SJeff.Bonwick@Sun.COM { 13837754SJeff.Bonwick@Sun.COM if (gn != NULL) 13847754SJeff.Bonwick@Sun.COM return (pio); 13855530Sbonwick 13867754SJeff.Bonwick@Sun.COM return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 13877754SJeff.Bonwick@Sun.COM NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 13887754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 1389789Sahrens } 1390789Sahrens 13917754SJeff.Bonwick@Sun.COM zio_t * 13927754SJeff.Bonwick@Sun.COM zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 13936523Sek110237 { 13947754SJeff.Bonwick@Sun.COM zio_t *zio; 13956523Sek110237 13967754SJeff.Bonwick@Sun.COM if (gn != NULL) { 13977754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 13987754SJeff.Bonwick@Sun.COM gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 13997754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14007754SJeff.Bonwick@Sun.COM /* 14017754SJeff.Bonwick@Sun.COM * As we rewrite each gang header, the pipeline will compute 14027754SJeff.Bonwick@Sun.COM * a new gang block header checksum for it; but no one will 14037754SJeff.Bonwick@Sun.COM * compute a new data checksum, so we do that here. The one 14047754SJeff.Bonwick@Sun.COM * exception is the gang leader: the pipeline already computed 14057754SJeff.Bonwick@Sun.COM * its data checksum because that stage precedes gang assembly. 14067754SJeff.Bonwick@Sun.COM * (Presently, nothing actually uses interior data checksums; 14077754SJeff.Bonwick@Sun.COM * this is just good hygiene.) 
14087754SJeff.Bonwick@Sun.COM */ 14099443SBill.Moore@Sun.COM if (gn != pio->io_gang_leader->io_gang_tree) { 14107754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 14117754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp)); 14127754SJeff.Bonwick@Sun.COM } 141310922SJeff.Bonwick@Sun.COM /* 141410922SJeff.Bonwick@Sun.COM * If we are here to damage data for testing purposes, 141510922SJeff.Bonwick@Sun.COM * leave the GBH alone so that we can detect the damage. 141610922SJeff.Bonwick@Sun.COM */ 141710922SJeff.Bonwick@Sun.COM if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 141810922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 14197754SJeff.Bonwick@Sun.COM } else { 14207754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 14217754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 14227754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 14236523Sek110237 } 14246523Sek110237 14257754SJeff.Bonwick@Sun.COM return (zio); 14267754SJeff.Bonwick@Sun.COM } 14277754SJeff.Bonwick@Sun.COM 14287754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14297754SJeff.Bonwick@Sun.COM zio_t * 14307754SJeff.Bonwick@Sun.COM zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14317754SJeff.Bonwick@Sun.COM { 143210922SJeff.Bonwick@Sun.COM return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 143310922SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio))); 14347754SJeff.Bonwick@Sun.COM } 14357754SJeff.Bonwick@Sun.COM 14367754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 14377754SJeff.Bonwick@Sun.COM zio_t * 14387754SJeff.Bonwick@Sun.COM zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 14397754SJeff.Bonwick@Sun.COM { 14407754SJeff.Bonwick@Sun.COM return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 14417754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 14427754SJeff.Bonwick@Sun.COM } 14437754SJeff.Bonwick@Sun.COM 14447754SJeff.Bonwick@Sun.COM 
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 14457754SJeff.Bonwick@Sun.COM NULL, 14467754SJeff.Bonwick@Sun.COM zio_read_gang, 14477754SJeff.Bonwick@Sun.COM zio_rewrite_gang, 14487754SJeff.Bonwick@Sun.COM zio_free_gang, 14497754SJeff.Bonwick@Sun.COM zio_claim_gang, 14507754SJeff.Bonwick@Sun.COM NULL 14517754SJeff.Bonwick@Sun.COM }; 14527754SJeff.Bonwick@Sun.COM 14537754SJeff.Bonwick@Sun.COM static void zio_gang_tree_assemble_done(zio_t *zio); 14547754SJeff.Bonwick@Sun.COM 14557754SJeff.Bonwick@Sun.COM static zio_gang_node_t * 14567754SJeff.Bonwick@Sun.COM zio_gang_node_alloc(zio_gang_node_t **gnpp) 14577754SJeff.Bonwick@Sun.COM { 14587754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn; 14597754SJeff.Bonwick@Sun.COM 14607754SJeff.Bonwick@Sun.COM ASSERT(*gnpp == NULL); 14617754SJeff.Bonwick@Sun.COM 14627754SJeff.Bonwick@Sun.COM gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 14637754SJeff.Bonwick@Sun.COM gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 14647754SJeff.Bonwick@Sun.COM *gnpp = gn; 14657754SJeff.Bonwick@Sun.COM 14667754SJeff.Bonwick@Sun.COM return (gn); 14676523Sek110237 } 14686523Sek110237 14696523Sek110237 static void 14707754SJeff.Bonwick@Sun.COM zio_gang_node_free(zio_gang_node_t **gnpp) 14717754SJeff.Bonwick@Sun.COM { 14727754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 14737754SJeff.Bonwick@Sun.COM 14747754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 14757754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_child[g] == NULL); 14767754SJeff.Bonwick@Sun.COM 14777754SJeff.Bonwick@Sun.COM zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 14787754SJeff.Bonwick@Sun.COM kmem_free(gn, sizeof (*gn)); 14797754SJeff.Bonwick@Sun.COM *gnpp = NULL; 14807754SJeff.Bonwick@Sun.COM } 14817754SJeff.Bonwick@Sun.COM 14827754SJeff.Bonwick@Sun.COM static void 14837754SJeff.Bonwick@Sun.COM zio_gang_tree_free(zio_gang_node_t **gnpp) 1484789Sahrens { 14857754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 14867754SJeff.Bonwick@Sun.COM 
14877754SJeff.Bonwick@Sun.COM if (gn == NULL) 14887754SJeff.Bonwick@Sun.COM return; 14897754SJeff.Bonwick@Sun.COM 14907754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 14917754SJeff.Bonwick@Sun.COM zio_gang_tree_free(&gn->gn_child[g]); 14927754SJeff.Bonwick@Sun.COM 14937754SJeff.Bonwick@Sun.COM zio_gang_node_free(gnpp); 14947754SJeff.Bonwick@Sun.COM } 14957754SJeff.Bonwick@Sun.COM 14967754SJeff.Bonwick@Sun.COM static void 14979443SBill.Moore@Sun.COM zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 14987754SJeff.Bonwick@Sun.COM { 14997754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1500789Sahrens 15019443SBill.Moore@Sun.COM ASSERT(gio->io_gang_leader == gio); 15027754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp)); 15037754SJeff.Bonwick@Sun.COM 15049443SBill.Moore@Sun.COM zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 15057754SJeff.Bonwick@Sun.COM SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 15069443SBill.Moore@Sun.COM gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 15077754SJeff.Bonwick@Sun.COM } 15087754SJeff.Bonwick@Sun.COM 15097754SJeff.Bonwick@Sun.COM static void 15107754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble_done(zio_t *zio) 15117754SJeff.Bonwick@Sun.COM { 15129443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 15137754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio->io_private; 15147754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 15157754SJeff.Bonwick@Sun.COM 15169443SBill.Moore@Sun.COM ASSERT(gio == zio_unique_parent(zio)); 151710922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 15187754SJeff.Bonwick@Sun.COM 15197754SJeff.Bonwick@Sun.COM if (zio->io_error) 15207754SJeff.Bonwick@Sun.COM return; 15217754SJeff.Bonwick@Sun.COM 15227754SJeff.Bonwick@Sun.COM if (BP_SHOULD_BYTESWAP(bp)) 15237754SJeff.Bonwick@Sun.COM byteswap_uint64_array(zio->io_data, zio->io_size); 15247754SJeff.Bonwick@Sun.COM 15257754SJeff.Bonwick@Sun.COM ASSERT(zio->io_data 
== gn->gn_gbh); 15267754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 15277754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 15287754SJeff.Bonwick@Sun.COM 15297754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 15307754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 15317754SJeff.Bonwick@Sun.COM if (!BP_IS_GANG(gbp)) 15327754SJeff.Bonwick@Sun.COM continue; 15339443SBill.Moore@Sun.COM zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1534789Sahrens } 1535789Sahrens } 1536789Sahrens 15377754SJeff.Bonwick@Sun.COM static void 15387754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1539789Sahrens { 15409443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 15417754SJeff.Bonwick@Sun.COM zio_t *zio; 15427754SJeff.Bonwick@Sun.COM 15437754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp) == !!gn); 15449443SBill.Moore@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 15459443SBill.Moore@Sun.COM ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 15467754SJeff.Bonwick@Sun.COM 15477754SJeff.Bonwick@Sun.COM /* 15487754SJeff.Bonwick@Sun.COM * If you're a gang header, your data is in gn->gn_gbh. 15497754SJeff.Bonwick@Sun.COM * If you're a gang member, your data is in 'data' and gn == NULL. 
15507754SJeff.Bonwick@Sun.COM */ 15519443SBill.Moore@Sun.COM zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1552789Sahrens 15537754SJeff.Bonwick@Sun.COM if (gn != NULL) { 15547754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 15557754SJeff.Bonwick@Sun.COM 15567754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 15577754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 15587754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(gbp)) 15597754SJeff.Bonwick@Sun.COM continue; 15607754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 15617754SJeff.Bonwick@Sun.COM data = (char *)data + BP_GET_PSIZE(gbp); 15627754SJeff.Bonwick@Sun.COM } 15637754SJeff.Bonwick@Sun.COM } 15647754SJeff.Bonwick@Sun.COM 15659443SBill.Moore@Sun.COM if (gn == gio->io_gang_tree) 15669443SBill.Moore@Sun.COM ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 15677754SJeff.Bonwick@Sun.COM 15687754SJeff.Bonwick@Sun.COM if (zio != pio) 15697754SJeff.Bonwick@Sun.COM zio_nowait(zio); 1570789Sahrens } 1571789Sahrens 15725530Sbonwick static int 15737754SJeff.Bonwick@Sun.COM zio_gang_assemble(zio_t *zio) 15745329Sgw25295 { 15755530Sbonwick blkptr_t *bp = zio->io_bp; 15765530Sbonwick 15779443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 15789443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 15799443SBill.Moore@Sun.COM 15809443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 15815530Sbonwick 15827754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1583789Sahrens 15845530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1585789Sahrens } 1586789Sahrens 15875530Sbonwick static int 15887754SJeff.Bonwick@Sun.COM zio_gang_issue(zio_t *zio) 15896523Sek110237 { 15906523Sek110237 blkptr_t *bp = zio->io_bp; 1591789Sahrens 15927754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 15937754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 
15945329Sgw25295 15959443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 15969443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1597789Sahrens 15987754SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 15999443SBill.Moore@Sun.COM zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 16007754SJeff.Bonwick@Sun.COM else 16019443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 1602789Sahrens 16037754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 16045530Sbonwick 16055530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1606789Sahrens } 1607789Sahrens 1608789Sahrens static void 16097754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready(zio_t *zio) 1610789Sahrens { 16118632SBill.Moore@Sun.COM zio_t *pio = zio_unique_parent(zio); 16129443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 16131775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 16141775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1615789Sahrens uint64_t asize; 16167754SJeff.Bonwick@Sun.COM 16177754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(zio->io_bp)) 16187754SJeff.Bonwick@Sun.COM return; 16197754SJeff.Bonwick@Sun.COM 16207754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1621789Sahrens 16227754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 162310922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 162410922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 162510922SJeff.Bonwick@Sun.COM ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 16261775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 16271775Sbillm 1628789Sahrens mutex_enter(&pio->io_lock); 16297754SJeff.Bonwick@Sun.COM for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 16301775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 16311775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 16321775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 16331775Sbillm 
DVA_SET_ASIZE(&pdva[d], asize); 16341775Sbillm } 1635789Sahrens mutex_exit(&pio->io_lock); 1636789Sahrens } 1637789Sahrens 16385329Sgw25295 static int 16397754SJeff.Bonwick@Sun.COM zio_write_gang_block(zio_t *pio) 1640789Sahrens { 16417754SJeff.Bonwick@Sun.COM spa_t *spa = pio->io_spa; 16427754SJeff.Bonwick@Sun.COM blkptr_t *bp = pio->io_bp; 16439443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 16447754SJeff.Bonwick@Sun.COM zio_t *zio; 16457754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn, **gnpp; 1646789Sahrens zio_gbh_phys_t *gbh; 16477754SJeff.Bonwick@Sun.COM uint64_t txg = pio->io_txg; 16487754SJeff.Bonwick@Sun.COM uint64_t resid = pio->io_size; 16497754SJeff.Bonwick@Sun.COM uint64_t lsize; 165010922SJeff.Bonwick@Sun.COM int copies = gio->io_prop.zp_copies; 165110922SJeff.Bonwick@Sun.COM int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 16527754SJeff.Bonwick@Sun.COM zio_prop_t zp; 1653789Sahrens int error; 1654789Sahrens 165510922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 165610922SJeff.Bonwick@Sun.COM bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 16577754SJeff.Bonwick@Sun.COM METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 16585530Sbonwick if (error) { 16597754SJeff.Bonwick@Sun.COM pio->io_error = error; 16605530Sbonwick return (ZIO_PIPELINE_CONTINUE); 16615530Sbonwick } 1662789Sahrens 16639443SBill.Moore@Sun.COM if (pio == gio) { 16649443SBill.Moore@Sun.COM gnpp = &gio->io_gang_tree; 16657754SJeff.Bonwick@Sun.COM } else { 16667754SJeff.Bonwick@Sun.COM gnpp = pio->io_private; 16677754SJeff.Bonwick@Sun.COM ASSERT(pio->io_ready == zio_write_gang_member_ready); 1668789Sahrens } 1669789Sahrens 16707754SJeff.Bonwick@Sun.COM gn = zio_gang_node_alloc(gnpp); 16717754SJeff.Bonwick@Sun.COM gbh = gn->gn_gbh; 16727754SJeff.Bonwick@Sun.COM bzero(gbh, SPA_GANGBLOCKSIZE); 1673789Sahrens 16747754SJeff.Bonwick@Sun.COM /* 16757754SJeff.Bonwick@Sun.COM * Create the gang header. 
16767754SJeff.Bonwick@Sun.COM */ 16777754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 16787754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 16795530Sbonwick 16801775Sbillm /* 16817754SJeff.Bonwick@Sun.COM * Create and nowait the gang children. 16821775Sbillm */ 16837754SJeff.Bonwick@Sun.COM for (int g = 0; resid != 0; resid -= lsize, g++) { 16847754SJeff.Bonwick@Sun.COM lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 16857754SJeff.Bonwick@Sun.COM SPA_MINBLOCKSIZE); 16867754SJeff.Bonwick@Sun.COM ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 16877754SJeff.Bonwick@Sun.COM 16889443SBill.Moore@Sun.COM zp.zp_checksum = gio->io_prop.zp_checksum; 16897754SJeff.Bonwick@Sun.COM zp.zp_compress = ZIO_COMPRESS_OFF; 16907754SJeff.Bonwick@Sun.COM zp.zp_type = DMU_OT_NONE; 16917754SJeff.Bonwick@Sun.COM zp.zp_level = 0; 169210922SJeff.Bonwick@Sun.COM zp.zp_copies = gio->io_prop.zp_copies; 169310922SJeff.Bonwick@Sun.COM zp.zp_dedup = 0; 169410922SJeff.Bonwick@Sun.COM zp.zp_dedup_verify = 0; 16957754SJeff.Bonwick@Sun.COM 16967754SJeff.Bonwick@Sun.COM zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 16977754SJeff.Bonwick@Sun.COM (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 16987754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready, NULL, &gn->gn_child[g], 16997754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 17007754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 17017754SJeff.Bonwick@Sun.COM } 17027754SJeff.Bonwick@Sun.COM 17037754SJeff.Bonwick@Sun.COM /* 17047754SJeff.Bonwick@Sun.COM * Set pio's pipeline to just wait for zio to finish. 
17057754SJeff.Bonwick@Sun.COM */ 17067754SJeff.Bonwick@Sun.COM pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 17077754SJeff.Bonwick@Sun.COM 17087754SJeff.Bonwick@Sun.COM zio_nowait(zio); 17097754SJeff.Bonwick@Sun.COM 17107754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1711789Sahrens } 1712789Sahrens 1713789Sahrens /* 1714789Sahrens * ========================================================================== 171510922SJeff.Bonwick@Sun.COM * Dedup 171610922SJeff.Bonwick@Sun.COM * ========================================================================== 171710922SJeff.Bonwick@Sun.COM */ 171810922SJeff.Bonwick@Sun.COM static void 171910922SJeff.Bonwick@Sun.COM zio_ddt_child_read_done(zio_t *zio) 172010922SJeff.Bonwick@Sun.COM { 172110922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 172210922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 172310922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 172410922SJeff.Bonwick@Sun.COM zio_t *pio = zio_unique_parent(zio); 172510922SJeff.Bonwick@Sun.COM 172610922SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 172710922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 172810922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) 172910922SJeff.Bonwick@Sun.COM ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 173010922SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && dde->dde_repair_data == NULL) 173110922SJeff.Bonwick@Sun.COM dde->dde_repair_data = zio->io_data; 173210922SJeff.Bonwick@Sun.COM else 173310922SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zio->io_size); 173410922SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 173510922SJeff.Bonwick@Sun.COM } 173610922SJeff.Bonwick@Sun.COM 173710922SJeff.Bonwick@Sun.COM static int 173810922SJeff.Bonwick@Sun.COM zio_ddt_read_start(zio_t *zio) 173910922SJeff.Bonwick@Sun.COM { 174010922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 174110922SJeff.Bonwick@Sun.COM 174210922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 174310922SJeff.Bonwick@Sun.COM 
ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 174410922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 174510922SJeff.Bonwick@Sun.COM 174610922SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_DDT]) { 174710922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 174810922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = ddt_repair_start(ddt, bp); 174910922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = dde->dde_phys; 175010922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 175110922SJeff.Bonwick@Sun.COM blkptr_t blk; 175210922SJeff.Bonwick@Sun.COM 175310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vsd == NULL); 175410922SJeff.Bonwick@Sun.COM zio->io_vsd = dde; 175510922SJeff.Bonwick@Sun.COM 175610922SJeff.Bonwick@Sun.COM if (ddp_self == NULL) 175710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 175810922SJeff.Bonwick@Sun.COM 175910922SJeff.Bonwick@Sun.COM for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 176010922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 176110922SJeff.Bonwick@Sun.COM continue; 176211125SJeff.Bonwick@Sun.COM ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 176311125SJeff.Bonwick@Sun.COM &blk); 176410922SJeff.Bonwick@Sun.COM zio_nowait(zio_read(zio, zio->io_spa, &blk, 176510922SJeff.Bonwick@Sun.COM zio_buf_alloc(zio->io_size), zio->io_size, 176610922SJeff.Bonwick@Sun.COM zio_ddt_child_read_done, dde, zio->io_priority, 176710922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 176810922SJeff.Bonwick@Sun.COM &zio->io_bookmark)); 176910922SJeff.Bonwick@Sun.COM } 177010922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 177110922SJeff.Bonwick@Sun.COM } 177210922SJeff.Bonwick@Sun.COM 177310922SJeff.Bonwick@Sun.COM zio_nowait(zio_read(zio, zio->io_spa, bp, 177410922SJeff.Bonwick@Sun.COM zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 177510922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 
177610922SJeff.Bonwick@Sun.COM 177710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 177810922SJeff.Bonwick@Sun.COM } 177910922SJeff.Bonwick@Sun.COM 178010922SJeff.Bonwick@Sun.COM static int 178110922SJeff.Bonwick@Sun.COM zio_ddt_read_done(zio_t *zio) 178210922SJeff.Bonwick@Sun.COM { 178310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 178410922SJeff.Bonwick@Sun.COM 178510922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 178610922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 178710922SJeff.Bonwick@Sun.COM 178810922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 178910922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 179010922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 179110922SJeff.Bonwick@Sun.COM 179210922SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_DDT]) { 179310922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 179410922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_vsd; 179510922SJeff.Bonwick@Sun.COM if (ddt == NULL) { 179611147SGeorge.Wilson@Sun.COM ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 179710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 179810922SJeff.Bonwick@Sun.COM } 179910922SJeff.Bonwick@Sun.COM if (dde == NULL) { 180010922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1801*11173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 180210922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 180310922SJeff.Bonwick@Sun.COM } 180410922SJeff.Bonwick@Sun.COM if (dde->dde_repair_data != NULL) { 180510922SJeff.Bonwick@Sun.COM bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 180610922SJeff.Bonwick@Sun.COM zio->io_child_error[ZIO_CHILD_DDT] = 0; 180710922SJeff.Bonwick@Sun.COM } 180810922SJeff.Bonwick@Sun.COM ddt_repair_done(ddt, dde); 180910922SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 181010922SJeff.Bonwick@Sun.COM } 181110922SJeff.Bonwick@Sun.COM 
181210922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vsd == NULL); 181310922SJeff.Bonwick@Sun.COM 181410922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 181510922SJeff.Bonwick@Sun.COM } 181610922SJeff.Bonwick@Sun.COM 181710922SJeff.Bonwick@Sun.COM static boolean_t 181810922SJeff.Bonwick@Sun.COM zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 181910922SJeff.Bonwick@Sun.COM { 182010922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 182110922SJeff.Bonwick@Sun.COM 182210922SJeff.Bonwick@Sun.COM /* 182310922SJeff.Bonwick@Sun.COM * Note: we compare the original data, not the transformed data, 182410922SJeff.Bonwick@Sun.COM * because when zio->io_bp is an override bp, we will not have 182510922SJeff.Bonwick@Sun.COM * pushed the I/O transforms. That's an important optimization 182610922SJeff.Bonwick@Sun.COM * because otherwise we'd compress/encrypt all dmu_sync() data twice. 182710922SJeff.Bonwick@Sun.COM */ 182810922SJeff.Bonwick@Sun.COM for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 182910922SJeff.Bonwick@Sun.COM zio_t *lio = dde->dde_lead_zio[p]; 183010922SJeff.Bonwick@Sun.COM 183110922SJeff.Bonwick@Sun.COM if (lio != NULL) { 183210922SJeff.Bonwick@Sun.COM return (lio->io_orig_size != zio->io_orig_size || 183310922SJeff.Bonwick@Sun.COM bcmp(zio->io_orig_data, lio->io_orig_data, 183410922SJeff.Bonwick@Sun.COM zio->io_orig_size) != 0); 183510922SJeff.Bonwick@Sun.COM } 183610922SJeff.Bonwick@Sun.COM } 183710922SJeff.Bonwick@Sun.COM 183810922SJeff.Bonwick@Sun.COM for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 183910922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 184010922SJeff.Bonwick@Sun.COM 184110922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) { 184210922SJeff.Bonwick@Sun.COM arc_buf_t *abuf = NULL; 184310922SJeff.Bonwick@Sun.COM uint32_t aflags = ARC_WAIT; 184410922SJeff.Bonwick@Sun.COM blkptr_t blk = *zio->io_bp; 184510922SJeff.Bonwick@Sun.COM int error; 184610922SJeff.Bonwick@Sun.COM 
184710922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 184810922SJeff.Bonwick@Sun.COM 184910922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 185010922SJeff.Bonwick@Sun.COM 185110922SJeff.Bonwick@Sun.COM error = arc_read_nolock(NULL, spa, &blk, 185210922SJeff.Bonwick@Sun.COM arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 185310922SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 185410922SJeff.Bonwick@Sun.COM &aflags, &zio->io_bookmark); 185510922SJeff.Bonwick@Sun.COM 185610922SJeff.Bonwick@Sun.COM if (error == 0) { 185710922SJeff.Bonwick@Sun.COM if (arc_buf_size(abuf) != zio->io_orig_size || 185810922SJeff.Bonwick@Sun.COM bcmp(abuf->b_data, zio->io_orig_data, 185910922SJeff.Bonwick@Sun.COM zio->io_orig_size) != 0) 186010922SJeff.Bonwick@Sun.COM error = EEXIST; 186110922SJeff.Bonwick@Sun.COM VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 186210922SJeff.Bonwick@Sun.COM } 186310922SJeff.Bonwick@Sun.COM 186410922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 186510922SJeff.Bonwick@Sun.COM return (error != 0); 186610922SJeff.Bonwick@Sun.COM } 186710922SJeff.Bonwick@Sun.COM } 186810922SJeff.Bonwick@Sun.COM 186910922SJeff.Bonwick@Sun.COM return (B_FALSE); 187010922SJeff.Bonwick@Sun.COM } 187110922SJeff.Bonwick@Sun.COM 187210922SJeff.Bonwick@Sun.COM static void 187310922SJeff.Bonwick@Sun.COM zio_ddt_child_write_ready(zio_t *zio) 187410922SJeff.Bonwick@Sun.COM { 187510922SJeff.Bonwick@Sun.COM int p = zio->io_prop.zp_copies; 187610922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 187710922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 187810922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 187910922SJeff.Bonwick@Sun.COM zio_t *pio; 188010922SJeff.Bonwick@Sun.COM 188110922SJeff.Bonwick@Sun.COM if (zio->io_error) 188210922SJeff.Bonwick@Sun.COM return; 188310922SJeff.Bonwick@Sun.COM 188410922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 188510922SJeff.Bonwick@Sun.COM 188610922SJeff.Bonwick@Sun.COM 
ASSERT(dde->dde_lead_zio[p] == zio); 188710922SJeff.Bonwick@Sun.COM 188810922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, zio->io_bp); 188910922SJeff.Bonwick@Sun.COM 189010922SJeff.Bonwick@Sun.COM while ((pio = zio_walk_parents(zio)) != NULL) 189110922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 189210922SJeff.Bonwick@Sun.COM 189310922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 189410922SJeff.Bonwick@Sun.COM } 189510922SJeff.Bonwick@Sun.COM 189610922SJeff.Bonwick@Sun.COM static void 189710922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done(zio_t *zio) 189810922SJeff.Bonwick@Sun.COM { 189910922SJeff.Bonwick@Sun.COM int p = zio->io_prop.zp_copies; 190010922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 190110922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 190210922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 190310922SJeff.Bonwick@Sun.COM 190410922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 190510922SJeff.Bonwick@Sun.COM 190610922SJeff.Bonwick@Sun.COM ASSERT(ddp->ddp_refcnt == 0); 190710922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 190810922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = NULL; 190910922SJeff.Bonwick@Sun.COM 191010922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 191110922SJeff.Bonwick@Sun.COM while (zio_walk_parents(zio) != NULL) 191210922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 191310922SJeff.Bonwick@Sun.COM } else { 191410922SJeff.Bonwick@Sun.COM ddt_phys_clear(ddp); 191510922SJeff.Bonwick@Sun.COM } 191610922SJeff.Bonwick@Sun.COM 191710922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 191810922SJeff.Bonwick@Sun.COM } 191910922SJeff.Bonwick@Sun.COM 192010922SJeff.Bonwick@Sun.COM static void 192110922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done(zio_t *zio) 192210922SJeff.Bonwick@Sun.COM { 192310922SJeff.Bonwick@Sun.COM int p = DDT_PHYS_DITTO; 192410922SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 192510922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 
192610922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(zio->io_spa, bp); 192710922SJeff.Bonwick@Sun.COM ddt_entry_t *dde = zio->io_private; 192810922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp = &dde->dde_phys[p]; 192910922SJeff.Bonwick@Sun.COM ddt_key_t *ddk = &dde->dde_key; 193010922SJeff.Bonwick@Sun.COM 193110922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 193210922SJeff.Bonwick@Sun.COM 193310922SJeff.Bonwick@Sun.COM ASSERT(ddp->ddp_refcnt == 0); 193410922SJeff.Bonwick@Sun.COM ASSERT(dde->dde_lead_zio[p] == zio); 193510922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = NULL; 193610922SJeff.Bonwick@Sun.COM 193710922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 193810922SJeff.Bonwick@Sun.COM ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 193910922SJeff.Bonwick@Sun.COM ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 194010922SJeff.Bonwick@Sun.COM ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 194110922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 194210922SJeff.Bonwick@Sun.COM ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 194310922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 194410922SJeff.Bonwick@Sun.COM } 194510922SJeff.Bonwick@Sun.COM 194610922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 194710922SJeff.Bonwick@Sun.COM } 194810922SJeff.Bonwick@Sun.COM 194910922SJeff.Bonwick@Sun.COM static int 195010922SJeff.Bonwick@Sun.COM zio_ddt_write(zio_t *zio) 195110922SJeff.Bonwick@Sun.COM { 195210922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 195310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 195410922SJeff.Bonwick@Sun.COM uint64_t txg = zio->io_txg; 195510922SJeff.Bonwick@Sun.COM zio_prop_t *zp = &zio->io_prop; 195610922SJeff.Bonwick@Sun.COM int p = zp->zp_copies; 195710922SJeff.Bonwick@Sun.COM int ditto_copies; 195810922SJeff.Bonwick@Sun.COM zio_t *cio = NULL; 195910922SJeff.Bonwick@Sun.COM zio_t *dio = NULL; 196010922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(spa, bp); 196110922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 196210922SJeff.Bonwick@Sun.COM 
ddt_phys_t *ddp; 196310922SJeff.Bonwick@Sun.COM 196410922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 196510922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 196610922SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 196710922SJeff.Bonwick@Sun.COM 196810922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 196910922SJeff.Bonwick@Sun.COM dde = ddt_lookup(ddt, bp, B_TRUE); 197010922SJeff.Bonwick@Sun.COM ddp = &dde->dde_phys[p]; 197110922SJeff.Bonwick@Sun.COM 197210922SJeff.Bonwick@Sun.COM if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 197310922SJeff.Bonwick@Sun.COM /* 197410922SJeff.Bonwick@Sun.COM * If we're using a weak checksum, upgrade to a strong checksum 197510922SJeff.Bonwick@Sun.COM * and try again. If we're already using a strong checksum, 197610922SJeff.Bonwick@Sun.COM * we can't resolve it, so just convert to an ordinary write. 197710922SJeff.Bonwick@Sun.COM * (And automatically e-mail a paper to Nature?) 197810922SJeff.Bonwick@Sun.COM */ 197910922SJeff.Bonwick@Sun.COM if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 198010922SJeff.Bonwick@Sun.COM zp->zp_checksum = spa_dedup_checksum(spa); 198110922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 198210922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 198310922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 198410922SJeff.Bonwick@Sun.COM } else { 198510922SJeff.Bonwick@Sun.COM zp->zp_dedup = 0; 198610922SJeff.Bonwick@Sun.COM } 198710922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 198810922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 198910922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 199010922SJeff.Bonwick@Sun.COM } 199110922SJeff.Bonwick@Sun.COM 199210922SJeff.Bonwick@Sun.COM ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 199310922SJeff.Bonwick@Sun.COM ASSERT(ditto_copies < SPA_DVAS_PER_BP); 199410922SJeff.Bonwick@Sun.COM 199510922SJeff.Bonwick@Sun.COM if (ditto_copies > ddt_ditto_copies_present(dde) && 199610922SJeff.Bonwick@Sun.COM 
dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 199710922SJeff.Bonwick@Sun.COM zio_prop_t czp = *zp; 199810922SJeff.Bonwick@Sun.COM 199910922SJeff.Bonwick@Sun.COM czp.zp_copies = ditto_copies; 200010922SJeff.Bonwick@Sun.COM 200110922SJeff.Bonwick@Sun.COM /* 200210922SJeff.Bonwick@Sun.COM * If we arrived here with an override bp, we won't have run 200310922SJeff.Bonwick@Sun.COM * the transform stack, so we won't have the data we need to 200410922SJeff.Bonwick@Sun.COM * generate a child i/o. So, toss the override bp and restart. 200510922SJeff.Bonwick@Sun.COM * This is safe, because using the override bp is just an 200610922SJeff.Bonwick@Sun.COM * optimization; and it's rare, so the cost doesn't matter. 200710922SJeff.Bonwick@Sun.COM */ 200810922SJeff.Bonwick@Sun.COM if (zio->io_bp_override) { 200910922SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); 201010922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_OPEN; 201110922SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_WRITE_PIPELINE; 201210922SJeff.Bonwick@Sun.COM zio->io_bp_override = NULL; 201310922SJeff.Bonwick@Sun.COM BP_ZERO(bp); 201410922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 201510922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 201610922SJeff.Bonwick@Sun.COM } 201710922SJeff.Bonwick@Sun.COM 201810922SJeff.Bonwick@Sun.COM dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 201910922SJeff.Bonwick@Sun.COM zio->io_orig_size, &czp, NULL, 202010922SJeff.Bonwick@Sun.COM zio_ddt_ditto_write_done, dde, zio->io_priority, 202110922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 202210922SJeff.Bonwick@Sun.COM 202310922SJeff.Bonwick@Sun.COM zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 202410922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 202510922SJeff.Bonwick@Sun.COM } 202610922SJeff.Bonwick@Sun.COM 202710922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 202810922SJeff.Bonwick@Sun.COM if (ddp->ddp_phys_birth != 0) 
202910922SJeff.Bonwick@Sun.COM ddt_bp_fill(ddp, bp, txg); 203010922SJeff.Bonwick@Sun.COM if (dde->dde_lead_zio[p] != NULL) 203110922SJeff.Bonwick@Sun.COM zio_add_child(zio, dde->dde_lead_zio[p]); 203210922SJeff.Bonwick@Sun.COM else 203310922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 203410922SJeff.Bonwick@Sun.COM } else if (zio->io_bp_override) { 203510922SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == txg); 203610922SJeff.Bonwick@Sun.COM ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 203710922SJeff.Bonwick@Sun.COM ddt_phys_fill(ddp, bp); 203810922SJeff.Bonwick@Sun.COM ddt_phys_addref(ddp); 203910922SJeff.Bonwick@Sun.COM } else { 204010922SJeff.Bonwick@Sun.COM cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 204110922SJeff.Bonwick@Sun.COM zio->io_orig_size, zp, zio_ddt_child_write_ready, 204210922SJeff.Bonwick@Sun.COM zio_ddt_child_write_done, dde, zio->io_priority, 204310922SJeff.Bonwick@Sun.COM ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 204410922SJeff.Bonwick@Sun.COM 204510922SJeff.Bonwick@Sun.COM zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 204610922SJeff.Bonwick@Sun.COM dde->dde_lead_zio[p] = cio; 204710922SJeff.Bonwick@Sun.COM } 204810922SJeff.Bonwick@Sun.COM 204910922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 205010922SJeff.Bonwick@Sun.COM 205110922SJeff.Bonwick@Sun.COM if (cio) 205210922SJeff.Bonwick@Sun.COM zio_nowait(cio); 205310922SJeff.Bonwick@Sun.COM if (dio) 205410922SJeff.Bonwick@Sun.COM zio_nowait(dio); 205510922SJeff.Bonwick@Sun.COM 205610922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 205710922SJeff.Bonwick@Sun.COM } 205810922SJeff.Bonwick@Sun.COM 205910922SJeff.Bonwick@Sun.COM static int 206010922SJeff.Bonwick@Sun.COM zio_ddt_free(zio_t *zio) 206110922SJeff.Bonwick@Sun.COM { 206210922SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 206310922SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 206410922SJeff.Bonwick@Sun.COM ddt_t *ddt = ddt_select(spa, bp); 206510922SJeff.Bonwick@Sun.COM ddt_entry_t *dde; 
206610922SJeff.Bonwick@Sun.COM ddt_phys_t *ddp; 206710922SJeff.Bonwick@Sun.COM 206810922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_DEDUP(bp)); 206910922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 207010922SJeff.Bonwick@Sun.COM 207110922SJeff.Bonwick@Sun.COM ddt_enter(ddt); 207210922SJeff.Bonwick@Sun.COM dde = ddt_lookup(ddt, bp, B_TRUE); 207310922SJeff.Bonwick@Sun.COM ddp = ddt_phys_select(dde, bp); 207410922SJeff.Bonwick@Sun.COM ddt_phys_decref(ddp); 207510922SJeff.Bonwick@Sun.COM ddt_exit(ddt); 207610922SJeff.Bonwick@Sun.COM 207710922SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 207810922SJeff.Bonwick@Sun.COM } 207910922SJeff.Bonwick@Sun.COM 208010922SJeff.Bonwick@Sun.COM /* 208110922SJeff.Bonwick@Sun.COM * ========================================================================== 2082789Sahrens * Allocate and free blocks 2083789Sahrens * ========================================================================== 2084789Sahrens */ 20855530Sbonwick static int 2086789Sahrens zio_dva_allocate(zio_t *zio) 2087789Sahrens { 20884527Sperrin spa_t *spa = zio->io_spa; 208910922SJeff.Bonwick@Sun.COM metaslab_class_t *mc = spa_normal_class(spa); 2090789Sahrens blkptr_t *bp = zio->io_bp; 2091789Sahrens int error; 2092789Sahrens 20939443SBill.Moore@Sun.COM if (zio->io_gang_leader == NULL) { 20949443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 20959443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 20969443SBill.Moore@Sun.COM } 20979443SBill.Moore@Sun.COM 2098789Sahrens ASSERT(BP_IS_HOLE(bp)); 20991775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 210010922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, >, 0); 210110922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2102789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2103789Sahrens 21047754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, mc, zio->io_size, bp, 210510922SJeff.Bonwick@Sun.COM zio->io_prop.zp_copies, zio->io_txg, NULL, 
0); 2106789Sahrens 21077754SJeff.Bonwick@Sun.COM if (error) { 21087754SJeff.Bonwick@Sun.COM if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 21097754SJeff.Bonwick@Sun.COM return (zio_write_gang_block(zio)); 2110789Sahrens zio->io_error = error; 2111789Sahrens } 21125530Sbonwick 21135530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2114789Sahrens } 2115789Sahrens 21165530Sbonwick static int 2117789Sahrens zio_dva_free(zio_t *zio) 2118789Sahrens { 21197754SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2120789Sahrens 21215530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2122789Sahrens } 2123789Sahrens 21245530Sbonwick static int 2125789Sahrens zio_dva_claim(zio_t *zio) 2126789Sahrens { 21277754SJeff.Bonwick@Sun.COM int error; 21287754SJeff.Bonwick@Sun.COM 21297754SJeff.Bonwick@Sun.COM error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 21307754SJeff.Bonwick@Sun.COM if (error) 21317754SJeff.Bonwick@Sun.COM zio->io_error = error; 2132789Sahrens 21335530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2134789Sahrens } 2135789Sahrens 2136789Sahrens /* 21377754SJeff.Bonwick@Sun.COM * Undo an allocation. This is used by zio_done() when an I/O fails 21387754SJeff.Bonwick@Sun.COM * and we want to give back the block we just allocated. 21397754SJeff.Bonwick@Sun.COM * This handles both normal blocks and gang blocks. 
21407754SJeff.Bonwick@Sun.COM */ 21417754SJeff.Bonwick@Sun.COM static void 21427754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 21437754SJeff.Bonwick@Sun.COM { 21447754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 214510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_bp_override == NULL); 21467754SJeff.Bonwick@Sun.COM 21477754SJeff.Bonwick@Sun.COM if (!BP_IS_HOLE(bp)) 214810922SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 21497754SJeff.Bonwick@Sun.COM 21507754SJeff.Bonwick@Sun.COM if (gn != NULL) { 21517754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 21527754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio, gn->gn_child[g], 21537754SJeff.Bonwick@Sun.COM &gn->gn_gbh->zg_blkptr[g]); 21547754SJeff.Bonwick@Sun.COM } 21557754SJeff.Bonwick@Sun.COM } 21567754SJeff.Bonwick@Sun.COM } 21577754SJeff.Bonwick@Sun.COM 21587754SJeff.Bonwick@Sun.COM /* 21597754SJeff.Bonwick@Sun.COM * Try to allocate an intent log block. Return 0 on success, errno on failure. 
21607754SJeff.Bonwick@Sun.COM */ 21617754SJeff.Bonwick@Sun.COM int 216210922SJeff.Bonwick@Sun.COM zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 216310922SJeff.Bonwick@Sun.COM uint64_t size, boolean_t use_slog) 21647754SJeff.Bonwick@Sun.COM { 216510310SNeil.Perrin@Sun.COM int error = 1; 21667754SJeff.Bonwick@Sun.COM 216710922SJeff.Bonwick@Sun.COM ASSERT(txg > spa_syncing_txg(spa)); 216810922SJeff.Bonwick@Sun.COM 216910879SNeil.Perrin@Sun.COM if (use_slog) 217010922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_log_class(spa), size, 217110310SNeil.Perrin@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21727754SJeff.Bonwick@Sun.COM 21737754SJeff.Bonwick@Sun.COM if (error) 217410922SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa_normal_class(spa), size, 21757754SJeff.Bonwick@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 21767754SJeff.Bonwick@Sun.COM 21777754SJeff.Bonwick@Sun.COM if (error == 0) { 21787754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(new_bp, size); 21797754SJeff.Bonwick@Sun.COM BP_SET_PSIZE(new_bp, size); 21807754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 21817754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 21827754SJeff.Bonwick@Sun.COM BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 21837754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(new_bp, 0); 218410922SJeff.Bonwick@Sun.COM BP_SET_DEDUP(new_bp, 0); 21857754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 21867754SJeff.Bonwick@Sun.COM } 21877754SJeff.Bonwick@Sun.COM 21887754SJeff.Bonwick@Sun.COM return (error); 21897754SJeff.Bonwick@Sun.COM } 21907754SJeff.Bonwick@Sun.COM 21917754SJeff.Bonwick@Sun.COM /* 219210922SJeff.Bonwick@Sun.COM * Free an intent log block. 
21937754SJeff.Bonwick@Sun.COM */ 21947754SJeff.Bonwick@Sun.COM void 219510922SJeff.Bonwick@Sun.COM zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 21967754SJeff.Bonwick@Sun.COM { 219710922SJeff.Bonwick@Sun.COM ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 21987754SJeff.Bonwick@Sun.COM ASSERT(!BP_IS_GANG(bp)); 21997754SJeff.Bonwick@Sun.COM 220010922SJeff.Bonwick@Sun.COM zio_free(spa, txg, bp); 22017754SJeff.Bonwick@Sun.COM } 22027754SJeff.Bonwick@Sun.COM 22037754SJeff.Bonwick@Sun.COM /* 2204789Sahrens * ========================================================================== 2205789Sahrens * Read and write to physical devices 2206789Sahrens * ========================================================================== 2207789Sahrens */ 22085530Sbonwick static int 22091775Sbillm zio_vdev_io_start(zio_t *zio) 2210789Sahrens { 2211789Sahrens vdev_t *vd = zio->io_vd; 22121775Sbillm uint64_t align; 22135329Sgw25295 spa_t *spa = zio->io_spa; 22145329Sgw25295 22157754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0); 22167754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 22177754SJeff.Bonwick@Sun.COM 22187754SJeff.Bonwick@Sun.COM if (vd == NULL) { 22197754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 22207754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2221789Sahrens 22227754SJeff.Bonwick@Sun.COM /* 22237754SJeff.Bonwick@Sun.COM * The mirror_ops handle multiple DVAs in a single BP. 
22247754SJeff.Bonwick@Sun.COM */ 22255530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 22267754SJeff.Bonwick@Sun.COM } 22271775Sbillm 22287754SJeff.Bonwick@Sun.COM align = 1ULL << vd->vdev_top->vdev_ashift; 2229789Sahrens 22301732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 22311732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 22321732Sbonwick char *abuf = zio_buf_alloc(asize); 22337754SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 22341732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 22351732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 22361732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 22371732Sbonwick } 22387754SJeff.Bonwick@Sun.COM zio_push_transform(zio, abuf, asize, asize, zio_subblock); 22391732Sbonwick } 22401732Sbonwick 22411732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 22421732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 22438241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 22448241SJeff.Bonwick@Sun.COM 22458241SJeff.Bonwick@Sun.COM /* 22468241SJeff.Bonwick@Sun.COM * If this is a repair I/O, and there's no self-healing involved -- 22478241SJeff.Bonwick@Sun.COM * that is, we're just resilvering what we expect to resilver -- 22488241SJeff.Bonwick@Sun.COM * then don't do the I/O unless zio's txg is actually in vd's DTL. 22498241SJeff.Bonwick@Sun.COM * This prevents spurious resilvering with nested replication. 22508241SJeff.Bonwick@Sun.COM * For example, given a mirror of mirrors, (A+B)+(C+D), if only 22518241SJeff.Bonwick@Sun.COM * A is out of date, we'll read from C+D, then use the data to 22528241SJeff.Bonwick@Sun.COM * resilver A+B -- but we don't actually want to resilver B, just A. 22538241SJeff.Bonwick@Sun.COM * The top-level mirror has no way to know this, so instead we just 22548241SJeff.Bonwick@Sun.COM * discard unnecessary repairs as we work our way down the vdev tree. 
22558241SJeff.Bonwick@Sun.COM * The same logic applies to any form of nested replication: 22568241SJeff.Bonwick@Sun.COM * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 22578241SJeff.Bonwick@Sun.COM */ 22588241SJeff.Bonwick@Sun.COM if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 22598241SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 22608241SJeff.Bonwick@Sun.COM zio->io_txg != 0 && /* not a delegated i/o */ 22618241SJeff.Bonwick@Sun.COM !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 22628241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 22638241SJeff.Bonwick@Sun.COM zio_vdev_io_bypass(zio); 22648241SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 22658241SJeff.Bonwick@Sun.COM } 2266789Sahrens 22677754SJeff.Bonwick@Sun.COM if (vd->vdev_ops->vdev_op_leaf && 22687754SJeff.Bonwick@Sun.COM (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 22697754SJeff.Bonwick@Sun.COM 22707754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 22718632SBill.Moore@Sun.COM return (ZIO_PIPELINE_CONTINUE); 22727754SJeff.Bonwick@Sun.COM 22737754SJeff.Bonwick@Sun.COM if ((zio = vdev_queue_io(zio)) == NULL) 22747754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22757754SJeff.Bonwick@Sun.COM 22767754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 22777754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 22787754SJeff.Bonwick@Sun.COM zio_interrupt(zio); 22797754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22807754SJeff.Bonwick@Sun.COM } 22817754SJeff.Bonwick@Sun.COM } 22827754SJeff.Bonwick@Sun.COM 22835530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 2284789Sahrens } 2285789Sahrens 22865530Sbonwick static int 2287789Sahrens zio_vdev_io_done(zio_t *zio) 2288789Sahrens { 22897754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 22907754SJeff.Bonwick@Sun.COM vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 22917754SJeff.Bonwick@Sun.COM boolean_t unexpected_error = B_FALSE; 22925530Sbonwick 22937754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 22947754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22957754SJeff.Bonwick@Sun.COM 22967754SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2297789Sahrens 22987754SJeff.Bonwick@Sun.COM if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 22997754SJeff.Bonwick@Sun.COM 23007754SJeff.Bonwick@Sun.COM vdev_queue_io_done(zio); 23017754SJeff.Bonwick@Sun.COM 23027754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE) 23037754SJeff.Bonwick@Sun.COM vdev_cache_write(zio); 23047754SJeff.Bonwick@Sun.COM 23057754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23069725SEric.Schrock@Sun.COM zio->io_error = zio_handle_device_injection(vd, 23079725SEric.Schrock@Sun.COM zio, EIO); 2308789Sahrens 23097754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23107754SJeff.Bonwick@Sun.COM zio->io_error = zio_handle_label_injection(zio, EIO); 23117754SJeff.Bonwick@Sun.COM 23127754SJeff.Bonwick@Sun.COM if (zio->io_error) { 23137754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 23147754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 23157754SJeff.Bonwick@Sun.COM } else { 23167754SJeff.Bonwick@Sun.COM unexpected_error = B_TRUE; 23177754SJeff.Bonwick@Sun.COM } 23187754SJeff.Bonwick@Sun.COM } 23196976Seschrock } 23207754SJeff.Bonwick@Sun.COM 23217754SJeff.Bonwick@Sun.COM ops->vdev_op_io_done(zio); 2322789Sahrens 23237754SJeff.Bonwick@Sun.COM if (unexpected_error) 23248632SBill.Moore@Sun.COM VERIFY(vdev_probe(vd, zio) == NULL); 23257754SJeff.Bonwick@Sun.COM 23267754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2327789Sahrens } 2328789Sahrens 232910614SJonathan.Adams@Sun.COM /* 233010614SJonathan.Adams@Sun.COM * For non-raidz ZIOs, we can just copy aside the bad data read from the 
233110614SJonathan.Adams@Sun.COM * disk, and use that to finish the checksum ereport later. 233210614SJonathan.Adams@Sun.COM */ 233310614SJonathan.Adams@Sun.COM static void 233410614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 233510614SJonathan.Adams@Sun.COM const void *good_buf) 233610614SJonathan.Adams@Sun.COM { 233710614SJonathan.Adams@Sun.COM /* no processing needed */ 233810614SJonathan.Adams@Sun.COM zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 233910614SJonathan.Adams@Sun.COM } 234010614SJonathan.Adams@Sun.COM 234110614SJonathan.Adams@Sun.COM /*ARGSUSED*/ 234210614SJonathan.Adams@Sun.COM void 234310614SJonathan.Adams@Sun.COM zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 234410614SJonathan.Adams@Sun.COM { 234510614SJonathan.Adams@Sun.COM void *buf = zio_buf_alloc(zio->io_size); 234610614SJonathan.Adams@Sun.COM 234710614SJonathan.Adams@Sun.COM bcopy(zio->io_data, buf, zio->io_size); 234810614SJonathan.Adams@Sun.COM 234910614SJonathan.Adams@Sun.COM zcr->zcr_cbinfo = zio->io_size; 235010614SJonathan.Adams@Sun.COM zcr->zcr_cbdata = buf; 235110614SJonathan.Adams@Sun.COM zcr->zcr_finish = zio_vsd_default_cksum_finish; 235210614SJonathan.Adams@Sun.COM zcr->zcr_free = zio_buf_free; 235310614SJonathan.Adams@Sun.COM } 235410614SJonathan.Adams@Sun.COM 23555530Sbonwick static int 2356789Sahrens zio_vdev_io_assess(zio_t *zio) 2357789Sahrens { 2358789Sahrens vdev_t *vd = zio->io_vd; 2359789Sahrens 23607754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 23617754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 2362789Sahrens 23637754SJeff.Bonwick@Sun.COM if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 23647754SJeff.Bonwick@Sun.COM spa_config_exit(zio->io_spa, SCL_ZIO, zio); 23657754SJeff.Bonwick@Sun.COM 23667754SJeff.Bonwick@Sun.COM if (zio->io_vsd != NULL) { 236710614SJonathan.Adams@Sun.COM zio->io_vsd_ops->vsd_free(zio); 
23687754SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 23691732Sbonwick } 23701732Sbonwick 23717754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 23721544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 2373789Sahrens 2374789Sahrens /* 2375789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 2376*11173SJonathan.Adams@Sun.COM * 2377*11173SJonathan.Adams@Sun.COM * On retry, we cut in line in the issue queue, since we don't want 2378*11173SJonathan.Adams@Sun.COM * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2379789Sahrens */ 23807754SJeff.Bonwick@Sun.COM if (zio->io_error && vd == NULL && 23817754SJeff.Bonwick@Sun.COM !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 23827754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 23837754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2384789Sahrens zio->io_error = 0; 23857754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_RETRY | 23867754SJeff.Bonwick@Sun.COM ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 238710922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2388*11173SJonathan.Adams@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2389*11173SJonathan.Adams@Sun.COM zio_requeue_io_start_cut_in_line); 23907754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23917754SJeff.Bonwick@Sun.COM } 2392789Sahrens 23937754SJeff.Bonwick@Sun.COM /* 23947754SJeff.Bonwick@Sun.COM * If we got an error on a leaf device, convert it to ENXIO 23957754SJeff.Bonwick@Sun.COM * if the device is not accessible at all. 
23967754SJeff.Bonwick@Sun.COM */ 23977754SJeff.Bonwick@Sun.COM if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 23987754SJeff.Bonwick@Sun.COM !vdev_accessible(vd, zio)) 23997754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 24007754SJeff.Bonwick@Sun.COM 24017754SJeff.Bonwick@Sun.COM /* 24027754SJeff.Bonwick@Sun.COM * If we can't write to an interior vdev (mirror or RAID-Z), 24037754SJeff.Bonwick@Sun.COM * set vdev_cant_write so that we stop trying to allocate from it. 24047754SJeff.Bonwick@Sun.COM */ 24057754SJeff.Bonwick@Sun.COM if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 24067754SJeff.Bonwick@Sun.COM vd != NULL && !vd->vdev_ops->vdev_op_leaf) 24077754SJeff.Bonwick@Sun.COM vd->vdev_cant_write = B_TRUE; 24087754SJeff.Bonwick@Sun.COM 24097754SJeff.Bonwick@Sun.COM if (zio->io_error) 24107754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2411789Sahrens 24125530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2413789Sahrens } 2414789Sahrens 2415789Sahrens void 2416789Sahrens zio_vdev_io_reissue(zio_t *zio) 2417789Sahrens { 2418789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2419789Sahrens ASSERT(zio->io_error == 0); 2420789Sahrens 242110922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2422789Sahrens } 2423789Sahrens 2424789Sahrens void 2425789Sahrens zio_vdev_io_redone(zio_t *zio) 2426789Sahrens { 2427789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2428789Sahrens 242910922SJeff.Bonwick@Sun.COM zio->io_stage >>= 1; 2430789Sahrens } 2431789Sahrens 2432789Sahrens void 2433789Sahrens zio_vdev_io_bypass(zio_t *zio) 2434789Sahrens { 2435789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2436789Sahrens ASSERT(zio->io_error == 0); 2437789Sahrens 2438789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 243910922SJeff.Bonwick@Sun.COM zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2440789Sahrens } 2441789Sahrens 2442789Sahrens /* 2443789Sahrens * 
========================================================================== 2444789Sahrens * Generate and verify checksums 2445789Sahrens * ========================================================================== 2446789Sahrens */ 24475530Sbonwick static int 2448789Sahrens zio_checksum_generate(zio_t *zio) 2449789Sahrens { 2450789Sahrens blkptr_t *bp = zio->io_bp; 24517754SJeff.Bonwick@Sun.COM enum zio_checksum checksum; 2452789Sahrens 24537754SJeff.Bonwick@Sun.COM if (bp == NULL) { 24547754SJeff.Bonwick@Sun.COM /* 24557754SJeff.Bonwick@Sun.COM * This is zio_write_phys(). 24567754SJeff.Bonwick@Sun.COM * We're either generating a label checksum, or none at all. 24577754SJeff.Bonwick@Sun.COM */ 24587754SJeff.Bonwick@Sun.COM checksum = zio->io_prop.zp_checksum; 2459789Sahrens 24607754SJeff.Bonwick@Sun.COM if (checksum == ZIO_CHECKSUM_OFF) 24617754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 2462789Sahrens 24637754SJeff.Bonwick@Sun.COM ASSERT(checksum == ZIO_CHECKSUM_LABEL); 24647754SJeff.Bonwick@Sun.COM } else { 24657754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 24667754SJeff.Bonwick@Sun.COM ASSERT(!IO_IS_ALLOCATING(zio)); 24677754SJeff.Bonwick@Sun.COM checksum = ZIO_CHECKSUM_GANG_HEADER; 24687754SJeff.Bonwick@Sun.COM } else { 24697754SJeff.Bonwick@Sun.COM checksum = BP_GET_CHECKSUM(bp); 24707754SJeff.Bonwick@Sun.COM } 24717754SJeff.Bonwick@Sun.COM } 2472789Sahrens 24737754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2474789Sahrens 24755530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2476789Sahrens } 2477789Sahrens 24785530Sbonwick static int 2479789Sahrens zio_checksum_verify(zio_t *zio) 2480789Sahrens { 248110614SJonathan.Adams@Sun.COM zio_bad_cksum_t info; 24827754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 24837754SJeff.Bonwick@Sun.COM int error; 24847754SJeff.Bonwick@Sun.COM 248510922SJeff.Bonwick@Sun.COM ASSERT(zio->io_vd != NULL); 248610922SJeff.Bonwick@Sun.COM 
24877754SJeff.Bonwick@Sun.COM if (bp == NULL) { 24887754SJeff.Bonwick@Sun.COM /* 24897754SJeff.Bonwick@Sun.COM * This is zio_read_phys(). 24907754SJeff.Bonwick@Sun.COM * We're either verifying a label checksum, or nothing at all. 24917754SJeff.Bonwick@Sun.COM */ 24927754SJeff.Bonwick@Sun.COM if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 24937754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 24947754SJeff.Bonwick@Sun.COM 24957754SJeff.Bonwick@Sun.COM ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 24967754SJeff.Bonwick@Sun.COM } 24977754SJeff.Bonwick@Sun.COM 249810614SJonathan.Adams@Sun.COM if ((error = zio_checksum_error(zio, &info)) != 0) { 24997754SJeff.Bonwick@Sun.COM zio->io_error = error; 25007754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 250110614SJonathan.Adams@Sun.COM zfs_ereport_start_checksum(zio->io_spa, 250210614SJonathan.Adams@Sun.COM zio->io_vd, zio, zio->io_offset, 250310614SJonathan.Adams@Sun.COM zio->io_size, NULL, &info); 25047754SJeff.Bonwick@Sun.COM } 2505789Sahrens } 2506789Sahrens 25075530Sbonwick return (ZIO_PIPELINE_CONTINUE); 2508789Sahrens } 2509789Sahrens 2510789Sahrens /* 2511789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 2512789Sahrens */ 2513789Sahrens void 2514789Sahrens zio_checksum_verified(zio_t *zio) 2515789Sahrens { 251610922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2517789Sahrens } 2518789Sahrens 2519789Sahrens /* 25207754SJeff.Bonwick@Sun.COM * ========================================================================== 25217754SJeff.Bonwick@Sun.COM * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 25227754SJeff.Bonwick@Sun.COM * An error of 0 indictes success. ENXIO indicates whole-device failure, 25237754SJeff.Bonwick@Sun.COM * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 25247754SJeff.Bonwick@Sun.COM * indicate errors that are specific to one I/O, and most likely permanent. 
25257754SJeff.Bonwick@Sun.COM * Any other error is presumed to be worse because we weren't expecting it. 25267754SJeff.Bonwick@Sun.COM * ========================================================================== 2527789Sahrens */ 25287754SJeff.Bonwick@Sun.COM int 25297754SJeff.Bonwick@Sun.COM zio_worst_error(int e1, int e2) 2530789Sahrens { 25317754SJeff.Bonwick@Sun.COM static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 25327754SJeff.Bonwick@Sun.COM int r1, r2; 25331775Sbillm 25347754SJeff.Bonwick@Sun.COM for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 25357754SJeff.Bonwick@Sun.COM if (e1 == zio_error_rank[r1]) 25367754SJeff.Bonwick@Sun.COM break; 25377754SJeff.Bonwick@Sun.COM 25387754SJeff.Bonwick@Sun.COM for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 25397754SJeff.Bonwick@Sun.COM if (e2 == zio_error_rank[r2]) 25407754SJeff.Bonwick@Sun.COM break; 25417754SJeff.Bonwick@Sun.COM 25427754SJeff.Bonwick@Sun.COM return (r1 > r2 ? e1 : e2); 2543789Sahrens } 2544789Sahrens 2545789Sahrens /* 2546789Sahrens * ========================================================================== 25477754SJeff.Bonwick@Sun.COM * I/O completion 2548789Sahrens * ========================================================================== 2549789Sahrens */ 25507754SJeff.Bonwick@Sun.COM static int 25517754SJeff.Bonwick@Sun.COM zio_ready(zio_t *zio) 25527754SJeff.Bonwick@Sun.COM { 25537754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 25548632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 25557754SJeff.Bonwick@Sun.COM 255610922SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 255710922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 25589443SBill.Moore@Sun.COM return (ZIO_PIPELINE_STOP); 25599443SBill.Moore@Sun.COM 25607754SJeff.Bonwick@Sun.COM if (zio->io_ready) { 25617754SJeff.Bonwick@Sun.COM ASSERT(IO_IS_ALLOCATING(zio)); 25627754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg 
|| BP_IS_HOLE(bp)); 25637754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 25647754SJeff.Bonwick@Sun.COM 25657754SJeff.Bonwick@Sun.COM zio->io_ready(zio); 25667754SJeff.Bonwick@Sun.COM } 25677754SJeff.Bonwick@Sun.COM 25687754SJeff.Bonwick@Sun.COM if (bp != NULL && bp != &zio->io_bp_copy) 25697754SJeff.Bonwick@Sun.COM zio->io_bp_copy = *bp; 25707754SJeff.Bonwick@Sun.COM 25717754SJeff.Bonwick@Sun.COM if (zio->io_error) 25727754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 25737754SJeff.Bonwick@Sun.COM 25748632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 25758632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = 1; 25768632SBill.Moore@Sun.COM pio = zio_walk_parents(zio); 25778632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 25788632SBill.Moore@Sun.COM 25798632SBill.Moore@Sun.COM /* 25808632SBill.Moore@Sun.COM * As we notify zio's parents, new parents could be added. 25818632SBill.Moore@Sun.COM * New parents go to the head of zio's io_parent_list, however, 25828632SBill.Moore@Sun.COM * so we will (correctly) not notify them. The remainder of zio's 25838632SBill.Moore@Sun.COM * io_parent_list, from 'pio_next' onward, cannot change because 25848632SBill.Moore@Sun.COM * all parents must wait for us to be done before they can be done. 
25858632SBill.Moore@Sun.COM */ 25868632SBill.Moore@Sun.COM for (; pio != NULL; pio = pio_next) { 25878632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 25887754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_READY); 25898632SBill.Moore@Sun.COM } 25907754SJeff.Bonwick@Sun.COM 259110922SJeff.Bonwick@Sun.COM if (zio->io_flags & ZIO_FLAG_NODATA) { 259210922SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp)) { 259310922SJeff.Bonwick@Sun.COM zio->io_flags &= ~ZIO_FLAG_NODATA; 259410922SJeff.Bonwick@Sun.COM } else { 259510922SJeff.Bonwick@Sun.COM ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 259610922SJeff.Bonwick@Sun.COM zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 259710922SJeff.Bonwick@Sun.COM } 259810922SJeff.Bonwick@Sun.COM } 259910922SJeff.Bonwick@Sun.COM 260011026STim.Haley@Sun.COM if (zio_injection_enabled && 260111026STim.Haley@Sun.COM zio->io_spa->spa_syncing_txg == zio->io_txg) 260211026STim.Haley@Sun.COM zio_handle_ignored_writes(zio); 260311026STim.Haley@Sun.COM 26047754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 26057754SJeff.Bonwick@Sun.COM } 26067754SJeff.Bonwick@Sun.COM 26077754SJeff.Bonwick@Sun.COM static int 26087754SJeff.Bonwick@Sun.COM zio_done(zio_t *zio) 26097754SJeff.Bonwick@Sun.COM { 26107754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 26117754SJeff.Bonwick@Sun.COM zio_t *lio = zio->io_logical; 26127754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 26137754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 26147754SJeff.Bonwick@Sun.COM uint64_t psize = zio->io_size; 26158632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 26167754SJeff.Bonwick@Sun.COM 26177754SJeff.Bonwick@Sun.COM /* 26189443SBill.Moore@Sun.COM * If our children haven't all completed, 26197754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 
26207754SJeff.Bonwick@Sun.COM */ 26217754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 26227754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 262310922SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 26247754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 26257754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 26267754SJeff.Bonwick@Sun.COM 26277754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 26287754SJeff.Bonwick@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 26297754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[c][w] == 0); 26307754SJeff.Bonwick@Sun.COM 26317754SJeff.Bonwick@Sun.COM if (bp != NULL) { 26327754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[0] == 0); 26337754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[1] == 0); 26347754SJeff.Bonwick@Sun.COM ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 26358632SBill.Moore@Sun.COM (bp == zio_unique_parent(zio)->io_bp)); 26367754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 263710922SJeff.Bonwick@Sun.COM zio->io_bp_override == NULL && 26387754SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 26397754SJeff.Bonwick@Sun.COM ASSERT(!BP_SHOULD_BYTESWAP(bp)); 264010922SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 26417754SJeff.Bonwick@Sun.COM ASSERT(BP_COUNT_GANG(bp) == 0 || 26427754SJeff.Bonwick@Sun.COM (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 26437754SJeff.Bonwick@Sun.COM } 26447754SJeff.Bonwick@Sun.COM } 26457754SJeff.Bonwick@Sun.COM 26467754SJeff.Bonwick@Sun.COM /* 264710922SJeff.Bonwick@Sun.COM * If there were child vdev/gang/ddt errors, they apply to us now. 
26487754SJeff.Bonwick@Sun.COM */ 26497754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 26507754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 265110922SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 265210922SJeff.Bonwick@Sun.COM 265310922SJeff.Bonwick@Sun.COM /* 265410922SJeff.Bonwick@Sun.COM * If the I/O on the transformed data was successful, generate any 265510922SJeff.Bonwick@Sun.COM * checksum reports now while we still have the transformed data. 265610922SJeff.Bonwick@Sun.COM */ 265710922SJeff.Bonwick@Sun.COM if (zio->io_error == 0) { 265810922SJeff.Bonwick@Sun.COM while (zio->io_cksum_report != NULL) { 265910922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 266010922SJeff.Bonwick@Sun.COM uint64_t align = zcr->zcr_align; 266110922SJeff.Bonwick@Sun.COM uint64_t asize = P2ROUNDUP(psize, align); 266210922SJeff.Bonwick@Sun.COM char *abuf = zio->io_data; 266310922SJeff.Bonwick@Sun.COM 266410922SJeff.Bonwick@Sun.COM if (asize != psize) { 266510922SJeff.Bonwick@Sun.COM abuf = zio_buf_alloc(asize); 266610922SJeff.Bonwick@Sun.COM bcopy(zio->io_data, abuf, psize); 266710922SJeff.Bonwick@Sun.COM bzero(abuf + psize, asize - psize); 266810922SJeff.Bonwick@Sun.COM } 266910922SJeff.Bonwick@Sun.COM 267010922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 267110922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 267210922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, abuf); 267310922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 267410922SJeff.Bonwick@Sun.COM 267510922SJeff.Bonwick@Sun.COM if (asize != psize) 267610922SJeff.Bonwick@Sun.COM zio_buf_free(abuf, asize); 267710922SJeff.Bonwick@Sun.COM } 267810922SJeff.Bonwick@Sun.COM } 26797754SJeff.Bonwick@Sun.COM 26807754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); /* note: may set zio->io_error */ 26817754SJeff.Bonwick@Sun.COM 26827754SJeff.Bonwick@Sun.COM vdev_stat_update(zio, psize); 26837754SJeff.Bonwick@Sun.COM 
26847754SJeff.Bonwick@Sun.COM if (zio->io_error) { 26857754SJeff.Bonwick@Sun.COM /* 26867754SJeff.Bonwick@Sun.COM * If this I/O is attached to a particular vdev, 26877754SJeff.Bonwick@Sun.COM * generate an error message describing the I/O failure 26887754SJeff.Bonwick@Sun.COM * at the block level. We ignore these errors if the 26897754SJeff.Bonwick@Sun.COM * device is currently unavailable. 26907754SJeff.Bonwick@Sun.COM */ 26917754SJeff.Bonwick@Sun.COM if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 26927754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 26937754SJeff.Bonwick@Sun.COM 269410685SGeorge.Wilson@Sun.COM if ((zio->io_error == EIO || !(zio->io_flags & 269510685SGeorge.Wilson@Sun.COM (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 269610685SGeorge.Wilson@Sun.COM zio == lio) { 26977754SJeff.Bonwick@Sun.COM /* 26987754SJeff.Bonwick@Sun.COM * For logical I/O requests, tell the SPA to log the 26997754SJeff.Bonwick@Sun.COM * error and generate a logical data ereport. 27007754SJeff.Bonwick@Sun.COM */ 27017754SJeff.Bonwick@Sun.COM spa_log_error(spa, zio); 27027754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 27037754SJeff.Bonwick@Sun.COM 0, 0); 27047754SJeff.Bonwick@Sun.COM } 27057754SJeff.Bonwick@Sun.COM } 27067754SJeff.Bonwick@Sun.COM 27077754SJeff.Bonwick@Sun.COM if (zio->io_error && zio == lio) { 27087754SJeff.Bonwick@Sun.COM /* 27097754SJeff.Bonwick@Sun.COM * Determine whether zio should be reexecuted. This will 27107754SJeff.Bonwick@Sun.COM * propagate all the way to the root via zio_notify_parent(). 
27117754SJeff.Bonwick@Sun.COM */ 27127754SJeff.Bonwick@Sun.COM ASSERT(vd == NULL && bp != NULL); 271310922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 271410922SJeff.Bonwick@Sun.COM 271510922SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(zio) && 271610922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 27177754SJeff.Bonwick@Sun.COM if (zio->io_error != ENOSPC) 27187754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_NOW; 27197754SJeff.Bonwick@Sun.COM else 27207754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 272110922SJeff.Bonwick@Sun.COM } 27227754SJeff.Bonwick@Sun.COM 27237754SJeff.Bonwick@Sun.COM if ((zio->io_type == ZIO_TYPE_READ || 27247754SJeff.Bonwick@Sun.COM zio->io_type == ZIO_TYPE_FREE) && 27257754SJeff.Bonwick@Sun.COM zio->io_error == ENXIO && 272611147SGeorge.Wilson@Sun.COM spa_load_state(spa) == SPA_LOAD_NONE && 27277754SJeff.Bonwick@Sun.COM spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 27287754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 27297754SJeff.Bonwick@Sun.COM 27307754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 27317754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 273210614SJonathan.Adams@Sun.COM 273310614SJonathan.Adams@Sun.COM /* 273410614SJonathan.Adams@Sun.COM * Here is a possibly good place to attempt to do 273510614SJonathan.Adams@Sun.COM * either combinatorial reconstruction or error correction 273610614SJonathan.Adams@Sun.COM * based on checksums. It also might be a good place 273710614SJonathan.Adams@Sun.COM * to send out preliminary ereports before we suspend 273810614SJonathan.Adams@Sun.COM * processing. 273910614SJonathan.Adams@Sun.COM */ 27407754SJeff.Bonwick@Sun.COM } 27417754SJeff.Bonwick@Sun.COM 27427754SJeff.Bonwick@Sun.COM /* 27437754SJeff.Bonwick@Sun.COM * If there were logical child errors, they apply to us now. 
27447754SJeff.Bonwick@Sun.COM * We defer this until now to avoid conflating logical child 27457754SJeff.Bonwick@Sun.COM * errors with errors that happened to the zio itself when 27467754SJeff.Bonwick@Sun.COM * updating vdev stats and reporting FMA events above. 27477754SJeff.Bonwick@Sun.COM */ 27487754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 27497754SJeff.Bonwick@Sun.COM 275010922SJeff.Bonwick@Sun.COM if ((zio->io_error || zio->io_reexecute) && 275110922SJeff.Bonwick@Sun.COM IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 275210922SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 27539443SBill.Moore@Sun.COM zio_dva_unallocate(zio, zio->io_gang_tree, bp); 27549443SBill.Moore@Sun.COM 27559443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 27569443SBill.Moore@Sun.COM 27579470SGeorge.Wilson@Sun.COM /* 27589470SGeorge.Wilson@Sun.COM * Godfather I/Os should never suspend. 27599470SGeorge.Wilson@Sun.COM */ 27609470SGeorge.Wilson@Sun.COM if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 27619470SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 27629470SGeorge.Wilson@Sun.COM zio->io_reexecute = 0; 27639470SGeorge.Wilson@Sun.COM 27649470SGeorge.Wilson@Sun.COM if (zio->io_reexecute) { 27657754SJeff.Bonwick@Sun.COM /* 27667754SJeff.Bonwick@Sun.COM * This is a logical I/O that wants to reexecute. 27677754SJeff.Bonwick@Sun.COM * 27687754SJeff.Bonwick@Sun.COM * Reexecute is top-down. When an i/o fails, if it's not 27697754SJeff.Bonwick@Sun.COM * the root, it simply notifies its parent and sticks around. 27707754SJeff.Bonwick@Sun.COM * The parent, seeing that it still has children in zio_done(), 27717754SJeff.Bonwick@Sun.COM * does the same. This percolates all the way up to the root. 27727754SJeff.Bonwick@Sun.COM * The root i/o will reexecute or suspend the entire tree. 
27737754SJeff.Bonwick@Sun.COM * 27747754SJeff.Bonwick@Sun.COM * This approach ensures that zio_reexecute() honors 27757754SJeff.Bonwick@Sun.COM * all the original i/o dependency relationships, e.g. 27767754SJeff.Bonwick@Sun.COM * parents not executing until children are ready. 27777754SJeff.Bonwick@Sun.COM */ 27787754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 27797754SJeff.Bonwick@Sun.COM 27809443SBill.Moore@Sun.COM zio->io_gang_leader = NULL; 27817754SJeff.Bonwick@Sun.COM 27828632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 27838632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 27848632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 27858632SBill.Moore@Sun.COM 27869234SGeorge.Wilson@Sun.COM /* 27879234SGeorge.Wilson@Sun.COM * "The Godfather" I/O monitors its children but is 27889234SGeorge.Wilson@Sun.COM * not a true parent to them. It will track them through 27899234SGeorge.Wilson@Sun.COM * the pipeline but severs its ties whenever they get into 27909234SGeorge.Wilson@Sun.COM * trouble (e.g. suspended). This allows "The Godfather" 27919234SGeorge.Wilson@Sun.COM * I/O to return status without blocking. 
27929234SGeorge.Wilson@Sun.COM */ 27939234SGeorge.Wilson@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 27949234SGeorge.Wilson@Sun.COM zio_link_t *zl = zio->io_walk_link; 27959234SGeorge.Wilson@Sun.COM pio_next = zio_walk_parents(zio); 27969234SGeorge.Wilson@Sun.COM 27979234SGeorge.Wilson@Sun.COM if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 27989234SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 27999234SGeorge.Wilson@Sun.COM zio_remove_child(pio, zio, zl); 28009234SGeorge.Wilson@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28019234SGeorge.Wilson@Sun.COM } 28029234SGeorge.Wilson@Sun.COM } 28039234SGeorge.Wilson@Sun.COM 28048632SBill.Moore@Sun.COM if ((pio = zio_unique_parent(zio)) != NULL) { 28057754SJeff.Bonwick@Sun.COM /* 28067754SJeff.Bonwick@Sun.COM * We're not a root i/o, so there's nothing to do 28077754SJeff.Bonwick@Sun.COM * but notify our parent. Don't propagate errors 28087754SJeff.Bonwick@Sun.COM * upward since we haven't permanently failed yet. 28097754SJeff.Bonwick@Sun.COM */ 28109470SGeorge.Wilson@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 28117754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 28127754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28137754SJeff.Bonwick@Sun.COM } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 28147754SJeff.Bonwick@Sun.COM /* 28157754SJeff.Bonwick@Sun.COM * We'd fail again if we reexecuted now, so suspend 28167754SJeff.Bonwick@Sun.COM * until conditions improve (e.g. device comes online). 28177754SJeff.Bonwick@Sun.COM */ 28187754SJeff.Bonwick@Sun.COM zio_suspend(spa, zio); 28197754SJeff.Bonwick@Sun.COM } else { 28207754SJeff.Bonwick@Sun.COM /* 28217754SJeff.Bonwick@Sun.COM * Reexecution is potentially a huge amount of work. 28227754SJeff.Bonwick@Sun.COM * Hand it off to the otherwise-unused claim taskq. 
28237754SJeff.Bonwick@Sun.COM */ 28247754SJeff.Bonwick@Sun.COM (void) taskq_dispatch( 28257754SJeff.Bonwick@Sun.COM spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 28267754SJeff.Bonwick@Sun.COM (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 28277754SJeff.Bonwick@Sun.COM } 28287754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28297754SJeff.Bonwick@Sun.COM } 28307754SJeff.Bonwick@Sun.COM 283110922SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_count == 0); 28329470SGeorge.Wilson@Sun.COM ASSERT(zio->io_reexecute == 0); 28337754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 28347754SJeff.Bonwick@Sun.COM 283510922SJeff.Bonwick@Sun.COM /* 283610922SJeff.Bonwick@Sun.COM * Report any checksum errors, since the I/O is complete. 283710922SJeff.Bonwick@Sun.COM */ 283810614SJonathan.Adams@Sun.COM while (zio->io_cksum_report != NULL) { 283910922SJeff.Bonwick@Sun.COM zio_cksum_report_t *zcr = zio->io_cksum_report; 284010922SJeff.Bonwick@Sun.COM zio->io_cksum_report = zcr->zcr_next; 284110922SJeff.Bonwick@Sun.COM zcr->zcr_next = NULL; 284210922SJeff.Bonwick@Sun.COM zcr->zcr_finish(zcr, NULL); 284310922SJeff.Bonwick@Sun.COM zfs_ereport_free_checksum(zcr); 284410614SJonathan.Adams@Sun.COM } 284510614SJonathan.Adams@Sun.COM 28468632SBill.Moore@Sun.COM /* 28478632SBill.Moore@Sun.COM * It is the responsibility of the done callback to ensure that this 28488632SBill.Moore@Sun.COM * particular zio is no longer discoverable for adoption, and as 28498632SBill.Moore@Sun.COM * such, cannot acquire any new parents. 
28508632SBill.Moore@Sun.COM */ 28517754SJeff.Bonwick@Sun.COM if (zio->io_done) 28527754SJeff.Bonwick@Sun.COM zio->io_done(zio); 28537754SJeff.Bonwick@Sun.COM 28548632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 28558632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 28568632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 28577754SJeff.Bonwick@Sun.COM 28588632SBill.Moore@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 28598632SBill.Moore@Sun.COM zio_link_t *zl = zio->io_walk_link; 28608632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 28618632SBill.Moore@Sun.COM zio_remove_child(pio, zio, zl); 28627754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 28637754SJeff.Bonwick@Sun.COM } 28647754SJeff.Bonwick@Sun.COM 28657754SJeff.Bonwick@Sun.COM if (zio->io_waiter != NULL) { 28667754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 28677754SJeff.Bonwick@Sun.COM zio->io_executor = NULL; 28687754SJeff.Bonwick@Sun.COM cv_broadcast(&zio->io_cv); 28697754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 28707754SJeff.Bonwick@Sun.COM } else { 28717754SJeff.Bonwick@Sun.COM zio_destroy(zio); 28727754SJeff.Bonwick@Sun.COM } 28737754SJeff.Bonwick@Sun.COM 28747754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 28757754SJeff.Bonwick@Sun.COM } 28767754SJeff.Bonwick@Sun.COM 28777754SJeff.Bonwick@Sun.COM /* 28787754SJeff.Bonwick@Sun.COM * ========================================================================== 28797754SJeff.Bonwick@Sun.COM * I/O pipeline definition 28807754SJeff.Bonwick@Sun.COM * ========================================================================== 28817754SJeff.Bonwick@Sun.COM */ 288210922SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[] = { 28835530Sbonwick NULL, 288410922SJeff.Bonwick@Sun.COM zio_read_bp_init, 288510922SJeff.Bonwick@Sun.COM zio_free_bp_init, 28865530Sbonwick zio_issue_async, 28877754SJeff.Bonwick@Sun.COM zio_write_bp_init, 2888789Sahrens zio_checksum_generate, 
288910922SJeff.Bonwick@Sun.COM zio_ddt_read_start, 289010922SJeff.Bonwick@Sun.COM zio_ddt_read_done, 289110922SJeff.Bonwick@Sun.COM zio_ddt_write, 289210922SJeff.Bonwick@Sun.COM zio_ddt_free, 28937754SJeff.Bonwick@Sun.COM zio_gang_assemble, 28947754SJeff.Bonwick@Sun.COM zio_gang_issue, 2895789Sahrens zio_dva_allocate, 2896789Sahrens zio_dva_free, 2897789Sahrens zio_dva_claim, 2898789Sahrens zio_ready, 2899789Sahrens zio_vdev_io_start, 2900789Sahrens zio_vdev_io_done, 2901789Sahrens zio_vdev_io_assess, 2902789Sahrens zio_checksum_verify, 29037754SJeff.Bonwick@Sun.COM zio_done 2904789Sahrens }; 2905