/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

#define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
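
/*
 * zio_buf_cache backs metadata allocations (zio_buf_alloc) and
 * zio_data_buf_cache backs file data allocations (zio_data_buf_alloc);
 * see the comments above those functions for why the two are kept apart.
 */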

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
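	/*
	 * A worked example of these rules (illustrative only, assuming
	 * SPA_MINBLOCKSIZE is 512 and PAGESIZE is 4K): a 1536-byte buffer
	 * gets its own 512-aligned cache; 6144 bytes is not a multiple of
	 * PAGESIZE but is a multiple of 1024 (one quarter of 4096, the
	 * largest power of two not exceeding it), so it gets a 1024-aligned
	 * cache; 6656 bytes matches no rule, so the backfill loop below
	 * makes it share the next larger cache (7168 bytes).
	 */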
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the amount
 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

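/*
 * Note that the cache is selected purely by size, so a buffer obtained from
 * zio_buf_alloc(size) must be released with zio_buf_free(buf, size) using
 * the same size, and likewise for the zio_data_buf_* pair.
 */
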
/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, zio->io_size, data, size) != 0)
		zio->io_error = EIO;
}

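/*
 * Taken together, these routines let a zio temporarily substitute another
 * buffer for io_data: zio_push_transform() installs the new buffer (e.g. a
 * compressed copy for writes, or a physically-sized buffer for compressed
 * reads), and zio_pop_transforms() restores the original, invoking the
 * optional transform callback -- such as zio_decompress() -- to convert
 * the data back on the way out.
 */
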
/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 * continue calling these functions until they return NULL.
 * Otherwise, the next caller will pick up the list walk in
 * some indeterminate state.  (Otherwise every caller would
 * have to pass in a cookie to keep the state represented by
 * io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

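/*
 * The canonical walk -- the pattern zio_reexecute() uses below -- runs the
 * walker to completion, which leaves io_walk_link NULL for the next caller:
 *
 *	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
 *		cio_next = zio_walk_children(pio);
 *		...
 *	}
 */
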
zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage--;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

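/*
 * To summarize the interlock above: zio_wait_for_children() backs io_stage
 * up by one and records, in io_stall, the child count being waited on; as
 * each child completes, zio_notify_parent() folds the child's error and
 * reexecute state into the parent, decrements that count and, when it hits
 * zero, clears io_stall and re-dispatches the parent via zio_execute(), so
 * the stalled stage runs again.
 */
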
/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
    const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_data = data;
	zio->io_size = size;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);

	return (zio);
}

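/*
 * A typical synchronous caller (illustrative sketch only) pairs this with
 * zio_wait():
 *
 *	error = zio_wait(zio_read(NULL, spa, bp, buf, BP_GET_LSIZE(bp),
 *	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *
 * Asynchronous callers instead pass a done callback and use zio_nowait().
 */
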
void
zio_skip_write(zio_t *zio)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_stage == ZIO_STAGE_READY);
	ASSERT(!BP_IS_GANG(zio->io_bp));

	zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    zp->zp_type < DMU_OT_NUMTYPES &&
	    zp->zp_level < 32 &&
	    zp->zp_ndvas > 0 &&
	    zp->zp_ndvas <= spa_max_replication(spa));
	ASSERT(ready != NULL);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, int flags)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
		return (zio_null(pio, spa, NULL, NULL, NULL, flags));

	if (txg == spa->spa_syncing_txg &&
	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
	}

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	return (zio);
}

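/*
 * Note the tie-in with SYNC_PASS_DEFERRED_FREE above: a free issued from the
 * syncing txg after that pass is not executed immediately; the bp is queued
 * on spa_sync_bplist for spa_sync() to free later, and the caller simply
 * gets back a no-op null zio.
 */
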
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, int flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
	    vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, int flags, zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

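/*
 * The net effect of zio_read_bp_init(): a read of a compressed logical
 * block is staged into a temporary buffer of the physical (compressed)
 * size, with zio_decompress() installed as the transform that inflates the
 * data into the caller's buffer when the transform stack is popped; reads
 * of level-0 non-metadata blocks are also flagged ZIO_FLAG_DONT_CACHE.
 */
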
static int
zio_write_bp_init(zio_t *zio)
{
	zio_prop_t *zp = &zio->io_prop;
	int compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(compress != ZIO_COMPRESS_INHERIT);

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize)) {
			compress = ZIO_COMPRESS_OFF;
		} else if (csize != 0) {
			zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(csize != 0);
		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (csize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, csize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

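/*
 * This is where the SYNC_PASS_* tunables defined at the top of the file take
 * effect: with SYNC_PASS_DONT_COMPRESS at 4, compression is disabled from
 * sync pass 5 onward, and with SYNC_PASS_REWRITE at 1, a same-sized block
 * born in this txg is rewritten in place from pass 2 onward rather than
 * reallocated, which is what allows spa_sync() to converge.
 */
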
/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
	zio_type_t t = zio->io_type;

	/*
	 * If we're a config writer, the normal issue and interrupt threads
	 * may all be blocked waiting for the config lock.  In this case,
	 * select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER)
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	(void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		zio_stage_t stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

		while (((1U << ++stage) & pipeline) == 0)
			continue;

		ASSERT(stage <= ZIO_STAGE_DONE);
		ASSERT(zio->io_stall == NULL);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * issue async to avoid deadlock.
		 */
		if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
		    zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[stage](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio)) {
		/*
		 * Remember the failed bp so that the io_ready() callback
		 * can update its accounting upon reexecution.  The block
		 * was already freed in zio_done(); we indicate this with
		 * a fill count of -1 so that zio_free() knows to skip it.
		 */
		blkptr_t *bp = pio->io_bp;
		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
		bp->blk_fill = BLK_FILL_ALREADY_FREED;
		pio->io_bp_orig = *bp;
		BP_ZERO(bp);
	}

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
11957754SJeff.Bonwick@Sun.COM */ 11967754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 11977754SJeff.Bonwick@Sun.COM spa->spa_suspended = B_FALSE; 11987754SJeff.Bonwick@Sun.COM cv_broadcast(&spa->spa_suspend_cv); 11997754SJeff.Bonwick@Sun.COM pio = spa->spa_suspend_zio_root; 12007754SJeff.Bonwick@Sun.COM spa->spa_suspend_zio_root = NULL; 12017754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 12027754SJeff.Bonwick@Sun.COM 12037754SJeff.Bonwick@Sun.COM if (pio == NULL) 12049234SGeorge.Wilson@Sun.COM return (0); 12055530Sbonwick 12069234SGeorge.Wilson@Sun.COM zio_reexecute(pio); 12079234SGeorge.Wilson@Sun.COM return (zio_wait(pio)); 12087754SJeff.Bonwick@Sun.COM } 12097754SJeff.Bonwick@Sun.COM 12107754SJeff.Bonwick@Sun.COM void 12117754SJeff.Bonwick@Sun.COM zio_resume_wait(spa_t *spa) 12127754SJeff.Bonwick@Sun.COM { 12137754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_suspend_lock); 12147754SJeff.Bonwick@Sun.COM while (spa_suspended(spa)) 12157754SJeff.Bonwick@Sun.COM cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 12167754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_suspend_lock); 1217789Sahrens } 1218789Sahrens 1219789Sahrens /* 1220789Sahrens * ========================================================================== 12217754SJeff.Bonwick@Sun.COM * Gang blocks. 12227754SJeff.Bonwick@Sun.COM * 12237754SJeff.Bonwick@Sun.COM * A gang block is a collection of small blocks that looks to the DMU 12247754SJeff.Bonwick@Sun.COM * like one large block. When zio_dva_allocate() cannot find a block 12257754SJeff.Bonwick@Sun.COM * of the requested size, due to either severe fragmentation or the pool 12267754SJeff.Bonwick@Sun.COM * being nearly full, it calls zio_write_gang_block() to construct the 12277754SJeff.Bonwick@Sun.COM * block from smaller fragments. 12287754SJeff.Bonwick@Sun.COM * 12297754SJeff.Bonwick@Sun.COM * A gang block consists of a gang header (zio_gbh_phys_t) and up to 12307754SJeff.Bonwick@Sun.COM * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 12317754SJeff.Bonwick@Sun.COM * an indirect block: it's an array of block pointers. It consumes 12327754SJeff.Bonwick@Sun.COM * only one sector and hence is allocatable regardless of fragmentation. 12337754SJeff.Bonwick@Sun.COM * The gang header's bps point to its gang members, which hold the data. 12347754SJeff.Bonwick@Sun.COM * 12357754SJeff.Bonwick@Sun.COM * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 12367754SJeff.Bonwick@Sun.COM * as the verifier to ensure uniqueness of the SHA256 checksum. 12377754SJeff.Bonwick@Sun.COM * Critically, the gang block bp's blk_cksum is the checksum of the data, 12387754SJeff.Bonwick@Sun.COM * not the gang header. This ensures that data block signatures (needed for 12397754SJeff.Bonwick@Sun.COM * deduplication) are independent of how the block is physically stored. 12407754SJeff.Bonwick@Sun.COM * 12417754SJeff.Bonwick@Sun.COM * Gang blocks can be nested: a gang member may itself be a gang block. 12427754SJeff.Bonwick@Sun.COM * Thus every gang block is a tree in which root and all interior nodes are 12437754SJeff.Bonwick@Sun.COM * gang headers, and the leaves are normal blocks that contain user data. 12447754SJeff.Bonwick@Sun.COM * The root of the gang tree is called the gang leader. 
12457754SJeff.Bonwick@Sun.COM * 12467754SJeff.Bonwick@Sun.COM * To perform any operation (read, rewrite, free, claim) on a gang block, 12477754SJeff.Bonwick@Sun.COM * zio_gang_assemble() first assembles the gang tree (minus data leaves) 12487754SJeff.Bonwick@Sun.COM * in the io_gang_tree field of the original logical i/o by recursively 12497754SJeff.Bonwick@Sun.COM * reading the gang leader and all gang headers below it. This yields 12507754SJeff.Bonwick@Sun.COM * an in-core tree containing the contents of every gang header and the 12517754SJeff.Bonwick@Sun.COM * bps for every constituent of the gang block. 12527754SJeff.Bonwick@Sun.COM * 12537754SJeff.Bonwick@Sun.COM * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 12547754SJeff.Bonwick@Sun.COM * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 12557754SJeff.Bonwick@Sun.COM * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 12567754SJeff.Bonwick@Sun.COM * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 12577754SJeff.Bonwick@Sun.COM * zio_read_gang() is a wrapper around zio_read() that omits reading gang 12587754SJeff.Bonwick@Sun.COM * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 12597754SJeff.Bonwick@Sun.COM * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 12607754SJeff.Bonwick@Sun.COM * of the gang header plus zio_checksum_compute() of the data to update the 12617754SJeff.Bonwick@Sun.COM * gang header's blk_cksum as described above. 12627754SJeff.Bonwick@Sun.COM * 12637754SJeff.Bonwick@Sun.COM * The two-phase assemble/issue model solves the problem of partial failure -- 12647754SJeff.Bonwick@Sun.COM * what if you'd freed part of a gang block but then couldn't read the 12657754SJeff.Bonwick@Sun.COM * gang header for another part? Assembling the entire gang tree first 12667754SJeff.Bonwick@Sun.COM * ensures that all the necessary gang header I/O has succeeded before 12677754SJeff.Bonwick@Sun.COM * starting the actual work of free, claim, or write. Once the gang tree 12687754SJeff.Bonwick@Sun.COM * is assembled, free and claim are in-memory operations that cannot fail. 12697754SJeff.Bonwick@Sun.COM * 12707754SJeff.Bonwick@Sun.COM * In the event that a gang write fails, zio_dva_unallocate() walks the 12717754SJeff.Bonwick@Sun.COM * gang tree to immediately free (i.e. insert back into the space map) 12727754SJeff.Bonwick@Sun.COM * everything we've allocated. This ensures that we don't get ENOSPC 12737754SJeff.Bonwick@Sun.COM * errors during repeated suspend/resume cycles due to a flaky device. 12747754SJeff.Bonwick@Sun.COM * 12757754SJeff.Bonwick@Sun.COM * Gang rewrites only happen during sync-to-convergence. If we can't assemble 12767754SJeff.Bonwick@Sun.COM * the gang tree, we won't modify the block, so we can safely defer the free 12777754SJeff.Bonwick@Sun.COM * (knowing that the block is still intact). If we *can* assemble the gang 12787754SJeff.Bonwick@Sun.COM * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 12797754SJeff.Bonwick@Sun.COM * each constituent bp and we can allocate a new block on the next sync pass. 12807754SJeff.Bonwick@Sun.COM * 12817754SJeff.Bonwick@Sun.COM * In all cases, the gang tree allows complete recovery from partial failure. 
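 *
 * As a rough sketch, the issue phase is a preorder walk of the assembled
 * tree (this mirrors zio_gang_tree_issue() below): apply the per-type
 * callback to the current bp, then, if it is a gang header, recurse into
 * each non-hole entry of gn_gbh->zg_blkptr[], advancing the data pointer
 * by BP_GET_PSIZE() of each entry so that the constituents map onto
 * consecutive regions of the original I/O buffer.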
1282789Sahrens * ========================================================================== 1283789Sahrens */ 12845530Sbonwick 12857754SJeff.Bonwick@Sun.COM static zio_t * 12867754SJeff.Bonwick@Sun.COM zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 12877754SJeff.Bonwick@Sun.COM { 12887754SJeff.Bonwick@Sun.COM if (gn != NULL) 12897754SJeff.Bonwick@Sun.COM return (pio); 12905530Sbonwick 12917754SJeff.Bonwick@Sun.COM return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 12927754SJeff.Bonwick@Sun.COM NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 12937754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 1294789Sahrens } 1295789Sahrens 12967754SJeff.Bonwick@Sun.COM zio_t * 12977754SJeff.Bonwick@Sun.COM zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 12986523Sek110237 { 12997754SJeff.Bonwick@Sun.COM zio_t *zio; 13006523Sek110237 13017754SJeff.Bonwick@Sun.COM if (gn != NULL) { 13027754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 13037754SJeff.Bonwick@Sun.COM gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 13047754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 13057754SJeff.Bonwick@Sun.COM /* 13067754SJeff.Bonwick@Sun.COM * As we rewrite each gang header, the pipeline will compute 13077754SJeff.Bonwick@Sun.COM * a new gang block header checksum for it; but no one will 13087754SJeff.Bonwick@Sun.COM * compute a new data checksum, so we do that here. The one 13097754SJeff.Bonwick@Sun.COM * exception is the gang leader: the pipeline already computed 13107754SJeff.Bonwick@Sun.COM * its data checksum because that stage precedes gang assembly. 13117754SJeff.Bonwick@Sun.COM * (Presently, nothing actually uses interior data checksums; 13127754SJeff.Bonwick@Sun.COM * this is just good hygiene.) 
13137754SJeff.Bonwick@Sun.COM */ 1314*9443SBill.Moore@Sun.COM if (gn != pio->io_gang_leader->io_gang_tree) { 13157754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 13167754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp)); 13177754SJeff.Bonwick@Sun.COM } 13187754SJeff.Bonwick@Sun.COM } else { 13197754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 13207754SJeff.Bonwick@Sun.COM data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 13217754SJeff.Bonwick@Sun.COM ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 13226523Sek110237 } 13236523Sek110237 13247754SJeff.Bonwick@Sun.COM return (zio); 13257754SJeff.Bonwick@Sun.COM } 13267754SJeff.Bonwick@Sun.COM 13277754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 13287754SJeff.Bonwick@Sun.COM zio_t * 13297754SJeff.Bonwick@Sun.COM zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 13307754SJeff.Bonwick@Sun.COM { 13317754SJeff.Bonwick@Sun.COM return (zio_free(pio, pio->io_spa, pio->io_txg, bp, 13327754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 13337754SJeff.Bonwick@Sun.COM } 13347754SJeff.Bonwick@Sun.COM 13357754SJeff.Bonwick@Sun.COM /* ARGSUSED */ 13367754SJeff.Bonwick@Sun.COM zio_t * 13377754SJeff.Bonwick@Sun.COM zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 13387754SJeff.Bonwick@Sun.COM { 13397754SJeff.Bonwick@Sun.COM return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 13407754SJeff.Bonwick@Sun.COM NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 13417754SJeff.Bonwick@Sun.COM } 13427754SJeff.Bonwick@Sun.COM 13437754SJeff.Bonwick@Sun.COM static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 13447754SJeff.Bonwick@Sun.COM NULL, 13457754SJeff.Bonwick@Sun.COM zio_read_gang, 13467754SJeff.Bonwick@Sun.COM zio_rewrite_gang, 13477754SJeff.Bonwick@Sun.COM zio_free_gang, 13487754SJeff.Bonwick@Sun.COM zio_claim_gang, 13497754SJeff.Bonwick@Sun.COM NULL 13507754SJeff.Bonwick@Sun.COM }; 13517754SJeff.Bonwick@Sun.COM 13527754SJeff.Bonwick@Sun.COM static void zio_gang_tree_assemble_done(zio_t *zio); 13537754SJeff.Bonwick@Sun.COM 13547754SJeff.Bonwick@Sun.COM static zio_gang_node_t * 13557754SJeff.Bonwick@Sun.COM zio_gang_node_alloc(zio_gang_node_t **gnpp) 13567754SJeff.Bonwick@Sun.COM { 13577754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn; 13587754SJeff.Bonwick@Sun.COM 13597754SJeff.Bonwick@Sun.COM ASSERT(*gnpp == NULL); 13607754SJeff.Bonwick@Sun.COM 13617754SJeff.Bonwick@Sun.COM gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 13627754SJeff.Bonwick@Sun.COM gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 13637754SJeff.Bonwick@Sun.COM *gnpp = gn; 13647754SJeff.Bonwick@Sun.COM 13657754SJeff.Bonwick@Sun.COM return (gn); 13666523Sek110237 } 13676523Sek110237 13686523Sek110237 static void 13697754SJeff.Bonwick@Sun.COM zio_gang_node_free(zio_gang_node_t **gnpp) 13707754SJeff.Bonwick@Sun.COM { 13717754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 13727754SJeff.Bonwick@Sun.COM 13737754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 13747754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_child[g] == NULL); 13757754SJeff.Bonwick@Sun.COM 13767754SJeff.Bonwick@Sun.COM zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 13777754SJeff.Bonwick@Sun.COM kmem_free(gn, sizeof (*gn)); 13787754SJeff.Bonwick@Sun.COM *gnpp = NULL; 13797754SJeff.Bonwick@Sun.COM } 13807754SJeff.Bonwick@Sun.COM 13817754SJeff.Bonwick@Sun.COM static void 13827754SJeff.Bonwick@Sun.COM zio_gang_tree_free(zio_gang_node_t **gnpp) 1383789Sahrens { 13847754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = *gnpp; 
13857754SJeff.Bonwick@Sun.COM 13867754SJeff.Bonwick@Sun.COM if (gn == NULL) 13877754SJeff.Bonwick@Sun.COM return; 13887754SJeff.Bonwick@Sun.COM 13897754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 13907754SJeff.Bonwick@Sun.COM zio_gang_tree_free(&gn->gn_child[g]); 13917754SJeff.Bonwick@Sun.COM 13927754SJeff.Bonwick@Sun.COM zio_gang_node_free(gnpp); 13937754SJeff.Bonwick@Sun.COM } 13947754SJeff.Bonwick@Sun.COM 13957754SJeff.Bonwick@Sun.COM static void 1396*9443SBill.Moore@Sun.COM zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 13977754SJeff.Bonwick@Sun.COM { 13987754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1399789Sahrens 1400*9443SBill.Moore@Sun.COM ASSERT(gio->io_gang_leader == gio); 14017754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp)); 14027754SJeff.Bonwick@Sun.COM 1403*9443SBill.Moore@Sun.COM zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 14047754SJeff.Bonwick@Sun.COM SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1405*9443SBill.Moore@Sun.COM gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 14067754SJeff.Bonwick@Sun.COM } 14077754SJeff.Bonwick@Sun.COM 14087754SJeff.Bonwick@Sun.COM static void 14097754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble_done(zio_t *zio) 14107754SJeff.Bonwick@Sun.COM { 1411*9443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 14127754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn = zio->io_private; 14137754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 14147754SJeff.Bonwick@Sun.COM 1415*9443SBill.Moore@Sun.COM ASSERT(gio == zio_unique_parent(zio)); 14168632SBill.Moore@Sun.COM ASSERT(zio_walk_children(zio) == NULL); 14177754SJeff.Bonwick@Sun.COM 14187754SJeff.Bonwick@Sun.COM if (zio->io_error) 14197754SJeff.Bonwick@Sun.COM return; 14207754SJeff.Bonwick@Sun.COM 14217754SJeff.Bonwick@Sun.COM if (BP_SHOULD_BYTESWAP(bp)) 14227754SJeff.Bonwick@Sun.COM byteswap_uint64_array(zio->io_data, zio->io_size); 14237754SJeff.Bonwick@Sun.COM 14247754SJeff.Bonwick@Sun.COM ASSERT(zio->io_data == gn->gn_gbh); 14257754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 14267754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 14277754SJeff.Bonwick@Sun.COM 14287754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 14297754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 14307754SJeff.Bonwick@Sun.COM if (!BP_IS_GANG(gbp)) 14317754SJeff.Bonwick@Sun.COM continue; 1432*9443SBill.Moore@Sun.COM zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1433789Sahrens } 1434789Sahrens } 1435789Sahrens 14367754SJeff.Bonwick@Sun.COM static void 14377754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1438789Sahrens { 1439*9443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 14407754SJeff.Bonwick@Sun.COM zio_t *zio; 14417754SJeff.Bonwick@Sun.COM 14427754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp) == !!gn); 1443*9443SBill.Moore@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1444*9443SBill.Moore@Sun.COM ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 14457754SJeff.Bonwick@Sun.COM 14467754SJeff.Bonwick@Sun.COM /* 14477754SJeff.Bonwick@Sun.COM * If you're a gang header, your data is in gn->gn_gbh. 14487754SJeff.Bonwick@Sun.COM * If you're a gang member, your data is in 'data' and gn == NULL. 
14497754SJeff.Bonwick@Sun.COM */ 1450*9443SBill.Moore@Sun.COM zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1451789Sahrens 14527754SJeff.Bonwick@Sun.COM if (gn != NULL) { 14537754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 14547754SJeff.Bonwick@Sun.COM 14557754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 14567754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 14577754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(gbp)) 14587754SJeff.Bonwick@Sun.COM continue; 14597754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 14607754SJeff.Bonwick@Sun.COM data = (char *)data + BP_GET_PSIZE(gbp); 14617754SJeff.Bonwick@Sun.COM } 14627754SJeff.Bonwick@Sun.COM } 14637754SJeff.Bonwick@Sun.COM 1464*9443SBill.Moore@Sun.COM if (gn == gio->io_gang_tree) 1465*9443SBill.Moore@Sun.COM ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 14667754SJeff.Bonwick@Sun.COM 14677754SJeff.Bonwick@Sun.COM if (zio != pio) 14687754SJeff.Bonwick@Sun.COM zio_nowait(zio); 1469789Sahrens } 1470789Sahrens 14715530Sbonwick static int 14727754SJeff.Bonwick@Sun.COM zio_gang_assemble(zio_t *zio) 14735329Sgw25295 { 14745530Sbonwick blkptr_t *bp = zio->io_bp; 14755530Sbonwick 1476*9443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1477*9443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1478*9443SBill.Moore@Sun.COM 1479*9443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 14805530Sbonwick 14817754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1482789Sahrens 14835530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1484789Sahrens } 1485789Sahrens 14865530Sbonwick static int 14877754SJeff.Bonwick@Sun.COM zio_gang_issue(zio_t *zio) 14886523Sek110237 { 14896523Sek110237 blkptr_t *bp = zio->io_bp; 1490789Sahrens 14917754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 14927754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 14935329Sgw25295 1494*9443SBill.Moore@Sun.COM ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1495*9443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1496789Sahrens 14977754SJeff.Bonwick@Sun.COM if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1498*9443SBill.Moore@Sun.COM zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 14997754SJeff.Bonwick@Sun.COM else 1500*9443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 1501789Sahrens 15027754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 15035530Sbonwick 15045530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1505789Sahrens } 1506789Sahrens 1507789Sahrens static void 15087754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready(zio_t *zio) 1509789Sahrens { 15108632SBill.Moore@Sun.COM zio_t *pio = zio_unique_parent(zio); 1511*9443SBill.Moore@Sun.COM zio_t *gio = zio->io_gang_leader; 15121775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 15131775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1514789Sahrens uint64_t asize; 15157754SJeff.Bonwick@Sun.COM 15167754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(zio->io_bp)) 15177754SJeff.Bonwick@Sun.COM return; 15187754SJeff.Bonwick@Sun.COM 15197754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1520789Sahrens 15217754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1522*9443SBill.Moore@Sun.COM ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas); 15237754SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 15247754SJeff.Bonwick@Sun.COM 
ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 15251775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 15261775Sbillm 1527789Sahrens mutex_enter(&pio->io_lock); 15287754SJeff.Bonwick@Sun.COM for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 15291775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 15301775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 15311775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 15321775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 15331775Sbillm } 1534789Sahrens mutex_exit(&pio->io_lock); 1535789Sahrens } 1536789Sahrens 15375329Sgw25295 static int 15387754SJeff.Bonwick@Sun.COM zio_write_gang_block(zio_t *pio) 1539789Sahrens { 15407754SJeff.Bonwick@Sun.COM spa_t *spa = pio->io_spa; 15417754SJeff.Bonwick@Sun.COM blkptr_t *bp = pio->io_bp; 1542*9443SBill.Moore@Sun.COM zio_t *gio = pio->io_gang_leader; 15437754SJeff.Bonwick@Sun.COM zio_t *zio; 15447754SJeff.Bonwick@Sun.COM zio_gang_node_t *gn, **gnpp; 1545789Sahrens zio_gbh_phys_t *gbh; 15467754SJeff.Bonwick@Sun.COM uint64_t txg = pio->io_txg; 15477754SJeff.Bonwick@Sun.COM uint64_t resid = pio->io_size; 15487754SJeff.Bonwick@Sun.COM uint64_t lsize; 1549*9443SBill.Moore@Sun.COM int ndvas = gio->io_prop.zp_ndvas; 15501775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 15517754SJeff.Bonwick@Sun.COM zio_prop_t zp; 1552789Sahrens int error; 1553789Sahrens 15547754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, 1555*9443SBill.Moore@Sun.COM bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp, 15567754SJeff.Bonwick@Sun.COM METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 15575530Sbonwick if (error) { 15587754SJeff.Bonwick@Sun.COM pio->io_error = error; 15595530Sbonwick return (ZIO_PIPELINE_CONTINUE); 15605530Sbonwick } 1561789Sahrens 1562*9443SBill.Moore@Sun.COM if (pio == gio) { 1563*9443SBill.Moore@Sun.COM gnpp = &gio->io_gang_tree; 15647754SJeff.Bonwick@Sun.COM } else { 15657754SJeff.Bonwick@Sun.COM gnpp = pio->io_private; 15667754SJeff.Bonwick@Sun.COM ASSERT(pio->io_ready == zio_write_gang_member_ready); 1567789Sahrens } 1568789Sahrens 15697754SJeff.Bonwick@Sun.COM gn = zio_gang_node_alloc(gnpp); 15707754SJeff.Bonwick@Sun.COM gbh = gn->gn_gbh; 15717754SJeff.Bonwick@Sun.COM bzero(gbh, SPA_GANGBLOCKSIZE); 1572789Sahrens 15737754SJeff.Bonwick@Sun.COM /* 15747754SJeff.Bonwick@Sun.COM * Create the gang header. 15757754SJeff.Bonwick@Sun.COM */ 15767754SJeff.Bonwick@Sun.COM zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 15777754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 15785530Sbonwick 15791775Sbillm /* 15807754SJeff.Bonwick@Sun.COM * Create and nowait the gang children. 
15811775Sbillm */ 15827754SJeff.Bonwick@Sun.COM for (int g = 0; resid != 0; resid -= lsize, g++) { 15837754SJeff.Bonwick@Sun.COM lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 15847754SJeff.Bonwick@Sun.COM SPA_MINBLOCKSIZE); 15857754SJeff.Bonwick@Sun.COM ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 15867754SJeff.Bonwick@Sun.COM 1587*9443SBill.Moore@Sun.COM zp.zp_checksum = gio->io_prop.zp_checksum; 15887754SJeff.Bonwick@Sun.COM zp.zp_compress = ZIO_COMPRESS_OFF; 15897754SJeff.Bonwick@Sun.COM zp.zp_type = DMU_OT_NONE; 15907754SJeff.Bonwick@Sun.COM zp.zp_level = 0; 1591*9443SBill.Moore@Sun.COM zp.zp_ndvas = gio->io_prop.zp_ndvas; 15927754SJeff.Bonwick@Sun.COM 15937754SJeff.Bonwick@Sun.COM zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 15947754SJeff.Bonwick@Sun.COM (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 15957754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready, NULL, &gn->gn_child[g], 15967754SJeff.Bonwick@Sun.COM pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 15977754SJeff.Bonwick@Sun.COM &pio->io_bookmark)); 15987754SJeff.Bonwick@Sun.COM } 15997754SJeff.Bonwick@Sun.COM 16007754SJeff.Bonwick@Sun.COM /* 16017754SJeff.Bonwick@Sun.COM * Set pio's pipeline to just wait for zio to finish. 16027754SJeff.Bonwick@Sun.COM */ 16037754SJeff.Bonwick@Sun.COM pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 16047754SJeff.Bonwick@Sun.COM 16057754SJeff.Bonwick@Sun.COM zio_nowait(zio); 16067754SJeff.Bonwick@Sun.COM 16077754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1608789Sahrens } 1609789Sahrens 1610789Sahrens /* 1611789Sahrens * ========================================================================== 1612789Sahrens * Allocate and free blocks 1613789Sahrens * ========================================================================== 1614789Sahrens */ 16157754SJeff.Bonwick@Sun.COM 16165530Sbonwick static int 1617789Sahrens zio_dva_allocate(zio_t *zio) 1618789Sahrens { 16194527Sperrin spa_t *spa = zio->io_spa; 16204527Sperrin metaslab_class_t *mc = spa->spa_normal_class; 1621789Sahrens blkptr_t *bp = zio->io_bp; 1622789Sahrens int error; 1623789Sahrens 1624*9443SBill.Moore@Sun.COM if (zio->io_gang_leader == NULL) { 1625*9443SBill.Moore@Sun.COM ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1626*9443SBill.Moore@Sun.COM zio->io_gang_leader = zio; 1627*9443SBill.Moore@Sun.COM } 1628*9443SBill.Moore@Sun.COM 1629789Sahrens ASSERT(BP_IS_HOLE(bp)); 16301775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 16317754SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_ndvas, >, 0); 16327754SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); 1633789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1634789Sahrens 16357754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, mc, zio->io_size, bp, 16367754SJeff.Bonwick@Sun.COM zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); 1637789Sahrens 16387754SJeff.Bonwick@Sun.COM if (error) { 16397754SJeff.Bonwick@Sun.COM if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 16407754SJeff.Bonwick@Sun.COM return (zio_write_gang_block(zio)); 1641789Sahrens zio->io_error = error; 1642789Sahrens } 16435530Sbonwick 16445530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1645789Sahrens } 1646789Sahrens 16475530Sbonwick static int 1648789Sahrens zio_dva_free(zio_t *zio) 1649789Sahrens { 16507754SJeff.Bonwick@Sun.COM metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 1651789Sahrens 16525530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1653789Sahrens } 1654789Sahrens 16555530Sbonwick static int 1656789Sahrens zio_dva_claim(zio_t 
*zio) 1657789Sahrens { 16587754SJeff.Bonwick@Sun.COM int error; 16597754SJeff.Bonwick@Sun.COM 16607754SJeff.Bonwick@Sun.COM error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 16617754SJeff.Bonwick@Sun.COM if (error) 16627754SJeff.Bonwick@Sun.COM zio->io_error = error; 1663789Sahrens 16645530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1665789Sahrens } 1666789Sahrens 1667789Sahrens /* 16687754SJeff.Bonwick@Sun.COM * Undo an allocation. This is used by zio_done() when an I/O fails 16697754SJeff.Bonwick@Sun.COM * and we want to give back the block we just allocated. 16707754SJeff.Bonwick@Sun.COM * This handles both normal blocks and gang blocks. 16717754SJeff.Bonwick@Sun.COM */ 16727754SJeff.Bonwick@Sun.COM static void 16737754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 16747754SJeff.Bonwick@Sun.COM { 16757754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 16767754SJeff.Bonwick@Sun.COM boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); 16777754SJeff.Bonwick@Sun.COM 16787754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 16797754SJeff.Bonwick@Sun.COM 16807754SJeff.Bonwick@Sun.COM if (zio->io_bp == bp && !now) { 16817754SJeff.Bonwick@Sun.COM /* 16827754SJeff.Bonwick@Sun.COM * This is a rewrite for sync-to-convergence. 16837754SJeff.Bonwick@Sun.COM * We can't do a metaslab_free(NOW) because bp wasn't allocated 16847754SJeff.Bonwick@Sun.COM * during this sync pass, which means that metaslab_sync() 16857754SJeff.Bonwick@Sun.COM * already committed the allocation. 16867754SJeff.Bonwick@Sun.COM */ 16877754SJeff.Bonwick@Sun.COM ASSERT(DVA_EQUAL(BP_IDENTITY(bp), 16887754SJeff.Bonwick@Sun.COM BP_IDENTITY(&zio->io_bp_orig))); 16897754SJeff.Bonwick@Sun.COM ASSERT(spa_sync_pass(spa) > 1); 16907754SJeff.Bonwick@Sun.COM 16917754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp) && gn == NULL) { 16927754SJeff.Bonwick@Sun.COM /* 16937754SJeff.Bonwick@Sun.COM * This is a gang leader whose gang header(s) we 16947754SJeff.Bonwick@Sun.COM * couldn't read now, so defer the free until later. 16957754SJeff.Bonwick@Sun.COM * The block should still be intact because without 16967754SJeff.Bonwick@Sun.COM * the headers, we'd never even start the rewrite. 16977754SJeff.Bonwick@Sun.COM */ 16987754SJeff.Bonwick@Sun.COM bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 16997754SJeff.Bonwick@Sun.COM return; 17007754SJeff.Bonwick@Sun.COM } 17017754SJeff.Bonwick@Sun.COM } 17027754SJeff.Bonwick@Sun.COM 17037754SJeff.Bonwick@Sun.COM if (!BP_IS_HOLE(bp)) 17047754SJeff.Bonwick@Sun.COM metaslab_free(spa, bp, bp->blk_birth, now); 17057754SJeff.Bonwick@Sun.COM 17067754SJeff.Bonwick@Sun.COM if (gn != NULL) { 17077754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 17087754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio, gn->gn_child[g], 17097754SJeff.Bonwick@Sun.COM &gn->gn_gbh->zg_blkptr[g]); 17107754SJeff.Bonwick@Sun.COM } 17117754SJeff.Bonwick@Sun.COM } 17127754SJeff.Bonwick@Sun.COM } 17137754SJeff.Bonwick@Sun.COM 17147754SJeff.Bonwick@Sun.COM /* 17157754SJeff.Bonwick@Sun.COM * Try to allocate an intent log block. Return 0 on success, errno on failure. 
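 * In rough terms, the allocation below tries the dedicated log class
 * first and, failing that, falls back to the normal class before
 * initializing the new bp for the intent log.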
17167754SJeff.Bonwick@Sun.COM */ 17177754SJeff.Bonwick@Sun.COM int 17187754SJeff.Bonwick@Sun.COM zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 17197754SJeff.Bonwick@Sun.COM uint64_t txg) 17207754SJeff.Bonwick@Sun.COM { 17217754SJeff.Bonwick@Sun.COM int error; 17227754SJeff.Bonwick@Sun.COM 17237754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa->spa_log_class, size, 17247754SJeff.Bonwick@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 17257754SJeff.Bonwick@Sun.COM 17267754SJeff.Bonwick@Sun.COM if (error) 17277754SJeff.Bonwick@Sun.COM error = metaslab_alloc(spa, spa->spa_normal_class, size, 17287754SJeff.Bonwick@Sun.COM new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 17297754SJeff.Bonwick@Sun.COM 17307754SJeff.Bonwick@Sun.COM if (error == 0) { 17317754SJeff.Bonwick@Sun.COM BP_SET_LSIZE(new_bp, size); 17327754SJeff.Bonwick@Sun.COM BP_SET_PSIZE(new_bp, size); 17337754SJeff.Bonwick@Sun.COM BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 17347754SJeff.Bonwick@Sun.COM BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 17357754SJeff.Bonwick@Sun.COM BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 17367754SJeff.Bonwick@Sun.COM BP_SET_LEVEL(new_bp, 0); 17377754SJeff.Bonwick@Sun.COM BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 17387754SJeff.Bonwick@Sun.COM } 17397754SJeff.Bonwick@Sun.COM 17407754SJeff.Bonwick@Sun.COM return (error); 17417754SJeff.Bonwick@Sun.COM } 17427754SJeff.Bonwick@Sun.COM 17437754SJeff.Bonwick@Sun.COM /* 17447754SJeff.Bonwick@Sun.COM * Free an intent log block. We know it can't be a gang block, so there's 17457754SJeff.Bonwick@Sun.COM * nothing to do except metaslab_free() it. 17467754SJeff.Bonwick@Sun.COM */ 17477754SJeff.Bonwick@Sun.COM void 17487754SJeff.Bonwick@Sun.COM zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 17497754SJeff.Bonwick@Sun.COM { 17507754SJeff.Bonwick@Sun.COM ASSERT(!BP_IS_GANG(bp)); 17517754SJeff.Bonwick@Sun.COM 17527754SJeff.Bonwick@Sun.COM metaslab_free(spa, bp, txg, B_FALSE); 17537754SJeff.Bonwick@Sun.COM } 17547754SJeff.Bonwick@Sun.COM 17557754SJeff.Bonwick@Sun.COM /* 1756789Sahrens * ========================================================================== 1757789Sahrens * Read and write to physical devices 1758789Sahrens * ========================================================================== 1759789Sahrens */ 17605530Sbonwick static int 17611775Sbillm zio_vdev_io_start(zio_t *zio) 1762789Sahrens { 1763789Sahrens vdev_t *vd = zio->io_vd; 17641775Sbillm uint64_t align; 17655329Sgw25295 spa_t *spa = zio->io_spa; 17665329Sgw25295 17677754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0); 17687754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 17697754SJeff.Bonwick@Sun.COM 17707754SJeff.Bonwick@Sun.COM if (vd == NULL) { 17717754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 17727754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 1773789Sahrens 17747754SJeff.Bonwick@Sun.COM /* 17757754SJeff.Bonwick@Sun.COM * The mirror_ops handle multiple DVAs in a single BP. 
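 * In other words, a zio with no vdev is fanned out across the bp's DVAs,
 * with vdev_mirror_ops treating each DVA as if it were a mirror child.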
17767754SJeff.Bonwick@Sun.COM */ 17775530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 17787754SJeff.Bonwick@Sun.COM } 17791775Sbillm 17807754SJeff.Bonwick@Sun.COM align = 1ULL << vd->vdev_top->vdev_ashift; 1781789Sahrens 17821732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 17831732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 17841732Sbonwick char *abuf = zio_buf_alloc(asize); 17857754SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 17861732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 17871732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 17881732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 17891732Sbonwick } 17907754SJeff.Bonwick@Sun.COM zio_push_transform(zio, abuf, asize, asize, zio_subblock); 17911732Sbonwick } 17921732Sbonwick 17931732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 17941732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 17958241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 17968241SJeff.Bonwick@Sun.COM 17978241SJeff.Bonwick@Sun.COM /* 17988241SJeff.Bonwick@Sun.COM * If this is a repair I/O, and there's no self-healing involved -- 17998241SJeff.Bonwick@Sun.COM * that is, we're just resilvering what we expect to resilver -- 18008241SJeff.Bonwick@Sun.COM * then don't do the I/O unless zio's txg is actually in vd's DTL. 18018241SJeff.Bonwick@Sun.COM * This prevents spurious resilvering with nested replication. 18028241SJeff.Bonwick@Sun.COM * For example, given a mirror of mirrors, (A+B)+(C+D), if only 18038241SJeff.Bonwick@Sun.COM * A is out of date, we'll read from C+D, then use the data to 18048241SJeff.Bonwick@Sun.COM * resilver A+B -- but we don't actually want to resilver B, just A. 18058241SJeff.Bonwick@Sun.COM * The top-level mirror has no way to know this, so instead we just 18068241SJeff.Bonwick@Sun.COM * discard unnecessary repairs as we work our way down the vdev tree. 18078241SJeff.Bonwick@Sun.COM * The same logic applies to any form of nested replication: 18088241SJeff.Bonwick@Sun.COM * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
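 * Concretely, the check below bypasses a non-delegated repair write
 * unless vdev_dtl_contains(vd, DTL_PARTIAL, ...) shows that vd really
 * is missing data from this zio's txg.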
18098241SJeff.Bonwick@Sun.COM */ 18108241SJeff.Bonwick@Sun.COM if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 18118241SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 18128241SJeff.Bonwick@Sun.COM zio->io_txg != 0 && /* not a delegated i/o */ 18138241SJeff.Bonwick@Sun.COM !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 18148241SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_WRITE); 18158241SJeff.Bonwick@Sun.COM zio_vdev_io_bypass(zio); 18168241SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 18178241SJeff.Bonwick@Sun.COM } 1818789Sahrens 18197754SJeff.Bonwick@Sun.COM if (vd->vdev_ops->vdev_op_leaf && 18207754SJeff.Bonwick@Sun.COM (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 18217754SJeff.Bonwick@Sun.COM 18227754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 18238632SBill.Moore@Sun.COM return (ZIO_PIPELINE_CONTINUE); 18247754SJeff.Bonwick@Sun.COM 18257754SJeff.Bonwick@Sun.COM if ((zio = vdev_queue_io(zio)) == NULL) 18267754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 18277754SJeff.Bonwick@Sun.COM 18287754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 18297754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 18307754SJeff.Bonwick@Sun.COM zio_interrupt(zio); 18317754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 18327754SJeff.Bonwick@Sun.COM } 18337754SJeff.Bonwick@Sun.COM } 18347754SJeff.Bonwick@Sun.COM 18355530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 1836789Sahrens } 1837789Sahrens 18385530Sbonwick static int 1839789Sahrens zio_vdev_io_done(zio_t *zio) 1840789Sahrens { 18417754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 18427754SJeff.Bonwick@Sun.COM vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 18437754SJeff.Bonwick@Sun.COM boolean_t unexpected_error = B_FALSE; 18445530Sbonwick 18457754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 18467754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 18477754SJeff.Bonwick@Sun.COM 18487754SJeff.Bonwick@Sun.COM ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 1849789Sahrens 18507754SJeff.Bonwick@Sun.COM if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 18517754SJeff.Bonwick@Sun.COM 18527754SJeff.Bonwick@Sun.COM vdev_queue_io_done(zio); 18537754SJeff.Bonwick@Sun.COM 18547754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE) 18557754SJeff.Bonwick@Sun.COM vdev_cache_write(zio); 18567754SJeff.Bonwick@Sun.COM 18577754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 18587754SJeff.Bonwick@Sun.COM zio->io_error = zio_handle_device_injection(vd, EIO); 1859789Sahrens 18607754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 18617754SJeff.Bonwick@Sun.COM zio->io_error = zio_handle_label_injection(zio, EIO); 18627754SJeff.Bonwick@Sun.COM 18637754SJeff.Bonwick@Sun.COM if (zio->io_error) { 18647754SJeff.Bonwick@Sun.COM if (!vdev_accessible(vd, zio)) { 18657754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 18667754SJeff.Bonwick@Sun.COM } else { 18677754SJeff.Bonwick@Sun.COM unexpected_error = B_TRUE; 18687754SJeff.Bonwick@Sun.COM } 18697754SJeff.Bonwick@Sun.COM } 18706976Seschrock } 18717754SJeff.Bonwick@Sun.COM 18727754SJeff.Bonwick@Sun.COM ops->vdev_op_io_done(zio); 1873789Sahrens 18747754SJeff.Bonwick@Sun.COM if (unexpected_error) 18758632SBill.Moore@Sun.COM VERIFY(vdev_probe(vd, zio) == NULL); 18767754SJeff.Bonwick@Sun.COM 18777754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1878789Sahrens } 1879789Sahrens 18805530Sbonwick 
static int 1881789Sahrens zio_vdev_io_assess(zio_t *zio) 1882789Sahrens { 1883789Sahrens vdev_t *vd = zio->io_vd; 1884789Sahrens 18857754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 18867754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 1887789Sahrens 18887754SJeff.Bonwick@Sun.COM if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 18897754SJeff.Bonwick@Sun.COM spa_config_exit(zio->io_spa, SCL_ZIO, zio); 18907754SJeff.Bonwick@Sun.COM 18917754SJeff.Bonwick@Sun.COM if (zio->io_vsd != NULL) { 18927754SJeff.Bonwick@Sun.COM zio->io_vsd_free(zio); 18937754SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 18941732Sbonwick } 18951732Sbonwick 18967754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 18971544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1898789Sahrens 1899789Sahrens /* 1900789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1901789Sahrens */ 19027754SJeff.Bonwick@Sun.COM if (zio->io_error && vd == NULL && 19037754SJeff.Bonwick@Sun.COM !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 19047754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 19057754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 1906789Sahrens zio->io_error = 0; 19077754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_RETRY | 19087754SJeff.Bonwick@Sun.COM ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 19091775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 19107754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 19117754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 19127754SJeff.Bonwick@Sun.COM } 1913789Sahrens 19147754SJeff.Bonwick@Sun.COM /* 19157754SJeff.Bonwick@Sun.COM * If we got an error on a leaf device, convert it to ENXIO 19167754SJeff.Bonwick@Sun.COM * if the device is not accessible at all. 19177754SJeff.Bonwick@Sun.COM */ 19187754SJeff.Bonwick@Sun.COM if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 19197754SJeff.Bonwick@Sun.COM !vdev_accessible(vd, zio)) 19207754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 19217754SJeff.Bonwick@Sun.COM 19227754SJeff.Bonwick@Sun.COM /* 19237754SJeff.Bonwick@Sun.COM * If we can't write to an interior vdev (mirror or RAID-Z), 19247754SJeff.Bonwick@Sun.COM * set vdev_cant_write so that we stop trying to allocate from it. 
19257754SJeff.Bonwick@Sun.COM */ 19267754SJeff.Bonwick@Sun.COM if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 19277754SJeff.Bonwick@Sun.COM vd != NULL && !vd->vdev_ops->vdev_op_leaf) 19287754SJeff.Bonwick@Sun.COM vd->vdev_cant_write = B_TRUE; 19297754SJeff.Bonwick@Sun.COM 19307754SJeff.Bonwick@Sun.COM if (zio->io_error) 19317754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1932789Sahrens 19335530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1934789Sahrens } 1935789Sahrens 1936789Sahrens void 1937789Sahrens zio_vdev_io_reissue(zio_t *zio) 1938789Sahrens { 1939789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1940789Sahrens ASSERT(zio->io_error == 0); 1941789Sahrens 1942789Sahrens zio->io_stage--; 1943789Sahrens } 1944789Sahrens 1945789Sahrens void 1946789Sahrens zio_vdev_io_redone(zio_t *zio) 1947789Sahrens { 1948789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1949789Sahrens 1950789Sahrens zio->io_stage--; 1951789Sahrens } 1952789Sahrens 1953789Sahrens void 1954789Sahrens zio_vdev_io_bypass(zio_t *zio) 1955789Sahrens { 1956789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1957789Sahrens ASSERT(zio->io_error == 0); 1958789Sahrens 1959789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1960789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1961789Sahrens } 1962789Sahrens 1963789Sahrens /* 1964789Sahrens * ========================================================================== 1965789Sahrens * Generate and verify checksums 1966789Sahrens * ========================================================================== 1967789Sahrens */ 19685530Sbonwick static int 1969789Sahrens zio_checksum_generate(zio_t *zio) 1970789Sahrens { 1971789Sahrens blkptr_t *bp = zio->io_bp; 19727754SJeff.Bonwick@Sun.COM enum zio_checksum checksum; 1973789Sahrens 19747754SJeff.Bonwick@Sun.COM if (bp == NULL) { 19757754SJeff.Bonwick@Sun.COM /* 19767754SJeff.Bonwick@Sun.COM * This is zio_write_phys(). 19777754SJeff.Bonwick@Sun.COM * We're either generating a label checksum, or none at all. 19787754SJeff.Bonwick@Sun.COM */ 19797754SJeff.Bonwick@Sun.COM checksum = zio->io_prop.zp_checksum; 1980789Sahrens 19817754SJeff.Bonwick@Sun.COM if (checksum == ZIO_CHECKSUM_OFF) 19827754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 1983789Sahrens 19847754SJeff.Bonwick@Sun.COM ASSERT(checksum == ZIO_CHECKSUM_LABEL); 19857754SJeff.Bonwick@Sun.COM } else { 19867754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 19877754SJeff.Bonwick@Sun.COM ASSERT(!IO_IS_ALLOCATING(zio)); 19887754SJeff.Bonwick@Sun.COM checksum = ZIO_CHECKSUM_GANG_HEADER; 19897754SJeff.Bonwick@Sun.COM } else { 19907754SJeff.Bonwick@Sun.COM checksum = BP_GET_CHECKSUM(bp); 19917754SJeff.Bonwick@Sun.COM } 19927754SJeff.Bonwick@Sun.COM } 1993789Sahrens 19947754SJeff.Bonwick@Sun.COM zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 1995789Sahrens 19965530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1997789Sahrens } 1998789Sahrens 19995530Sbonwick static int 2000789Sahrens zio_checksum_verify(zio_t *zio) 2001789Sahrens { 20027754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 20037754SJeff.Bonwick@Sun.COM int error; 20047754SJeff.Bonwick@Sun.COM 20057754SJeff.Bonwick@Sun.COM if (bp == NULL) { 20067754SJeff.Bonwick@Sun.COM /* 20077754SJeff.Bonwick@Sun.COM * This is zio_read_phys(). 20087754SJeff.Bonwick@Sun.COM * We're either verifying a label checksum, or nothing at all. 
20097754SJeff.Bonwick@Sun.COM 	 */ 20107754SJeff.Bonwick@Sun.COM 	if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 20117754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE); 20127754SJeff.Bonwick@Sun.COM 20137754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 20147754SJeff.Bonwick@Sun.COM 	} 20157754SJeff.Bonwick@Sun.COM 20167754SJeff.Bonwick@Sun.COM 	if ((error = zio_checksum_error(zio)) != 0) { 20177754SJeff.Bonwick@Sun.COM 	zio->io_error = error; 20187754SJeff.Bonwick@Sun.COM 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 20191544Seschrock 	zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 20201544Seschrock 	zio->io_spa, zio->io_vd, zio, 0, 0); 20217754SJeff.Bonwick@Sun.COM 	} 2022789Sahrens 	} 2023789Sahrens 20245530Sbonwick 	return (ZIO_PIPELINE_CONTINUE); 2025789Sahrens } 2026789Sahrens 2027789Sahrens /* 2028789Sahrens  * Called by RAID-Z to ensure we don't compute the checksum twice. 2029789Sahrens  */ 2030789Sahrens void 2031789Sahrens zio_checksum_verified(zio_t *zio) 2032789Sahrens { 2033789Sahrens 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 2034789Sahrens } 2035789Sahrens 2036789Sahrens /* 20377754SJeff.Bonwick@Sun.COM  * ========================================================================== 20387754SJeff.Bonwick@Sun.COM  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 20397754SJeff.Bonwick@Sun.COM  * An error of 0 indicates success.  ENXIO indicates whole-device failure, 20407754SJeff.Bonwick@Sun.COM  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO 20417754SJeff.Bonwick@Sun.COM  * indicate errors that are specific to one I/O, and most likely permanent. 20427754SJeff.Bonwick@Sun.COM  * Any other error is presumed to be worse because we weren't expecting it. 20437754SJeff.Bonwick@Sun.COM  * ========================================================================== 2044789Sahrens  */ 20457754SJeff.Bonwick@Sun.COM int 20467754SJeff.Bonwick@Sun.COM zio_worst_error(int e1, int e2) 2047789Sahrens { 20487754SJeff.Bonwick@Sun.COM 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 20497754SJeff.Bonwick@Sun.COM 	int r1, r2; 20501775Sbillm 20517754SJeff.Bonwick@Sun.COM 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 20527754SJeff.Bonwick@Sun.COM 	if (e1 == zio_error_rank[r1]) 20537754SJeff.Bonwick@Sun.COM 	break; 20547754SJeff.Bonwick@Sun.COM 20557754SJeff.Bonwick@Sun.COM 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 20567754SJeff.Bonwick@Sun.COM 	if (e2 == zio_error_rank[r2]) 20577754SJeff.Bonwick@Sun.COM 	break; 20587754SJeff.Bonwick@Sun.COM 20597754SJeff.Bonwick@Sun.COM 	return (r1 > r2 ? 
e1 : e2); 2060789Sahrens } 2061789Sahrens 2062789Sahrens /* 2063789Sahrens * ========================================================================== 20647754SJeff.Bonwick@Sun.COM * I/O completion 2065789Sahrens * ========================================================================== 2066789Sahrens */ 20677754SJeff.Bonwick@Sun.COM static int 20687754SJeff.Bonwick@Sun.COM zio_ready(zio_t *zio) 20697754SJeff.Bonwick@Sun.COM { 20707754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 20718632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 20727754SJeff.Bonwick@Sun.COM 2073*9443SBill.Moore@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) 2074*9443SBill.Moore@Sun.COM return (ZIO_PIPELINE_STOP); 2075*9443SBill.Moore@Sun.COM 20767754SJeff.Bonwick@Sun.COM if (zio->io_ready) { 20777754SJeff.Bonwick@Sun.COM ASSERT(IO_IS_ALLOCATING(zio)); 20787754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 20797754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 20807754SJeff.Bonwick@Sun.COM 20817754SJeff.Bonwick@Sun.COM zio->io_ready(zio); 20827754SJeff.Bonwick@Sun.COM } 20837754SJeff.Bonwick@Sun.COM 20847754SJeff.Bonwick@Sun.COM if (bp != NULL && bp != &zio->io_bp_copy) 20857754SJeff.Bonwick@Sun.COM zio->io_bp_copy = *bp; 20867754SJeff.Bonwick@Sun.COM 20877754SJeff.Bonwick@Sun.COM if (zio->io_error) 20887754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 20897754SJeff.Bonwick@Sun.COM 20908632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 20918632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_READY] = 1; 20928632SBill.Moore@Sun.COM pio = zio_walk_parents(zio); 20938632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 20948632SBill.Moore@Sun.COM 20958632SBill.Moore@Sun.COM /* 20968632SBill.Moore@Sun.COM * As we notify zio's parents, new parents could be added. 20978632SBill.Moore@Sun.COM * New parents go to the head of zio's io_parent_list, however, 20988632SBill.Moore@Sun.COM * so we will (correctly) not notify them. The remainder of zio's 20998632SBill.Moore@Sun.COM * io_parent_list, from 'pio_next' onward, cannot change because 21008632SBill.Moore@Sun.COM * all parents must wait for us to be done before they can be done. 21018632SBill.Moore@Sun.COM */ 21028632SBill.Moore@Sun.COM for (; pio != NULL; pio = pio_next) { 21038632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 21047754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_READY); 21058632SBill.Moore@Sun.COM } 21067754SJeff.Bonwick@Sun.COM 21077754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_CONTINUE); 21087754SJeff.Bonwick@Sun.COM } 21097754SJeff.Bonwick@Sun.COM 21107754SJeff.Bonwick@Sun.COM static int 21117754SJeff.Bonwick@Sun.COM zio_done(zio_t *zio) 21127754SJeff.Bonwick@Sun.COM { 21137754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 21147754SJeff.Bonwick@Sun.COM zio_t *lio = zio->io_logical; 21157754SJeff.Bonwick@Sun.COM blkptr_t *bp = zio->io_bp; 21167754SJeff.Bonwick@Sun.COM vdev_t *vd = zio->io_vd; 21177754SJeff.Bonwick@Sun.COM uint64_t psize = zio->io_size; 21188632SBill.Moore@Sun.COM zio_t *pio, *pio_next; 21197754SJeff.Bonwick@Sun.COM 21207754SJeff.Bonwick@Sun.COM /* 2121*9443SBill.Moore@Sun.COM * If our children haven't all completed, 21227754SJeff.Bonwick@Sun.COM * wait for them and then repeat this pipeline stage. 
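 * Roughly speaking, zio_wait_for_children() rewinds the stage and parks
 * this zio; the last completing child's zio_notify_parent() re-dispatches
 * it, so returning ZIO_PIPELINE_STOP here is not a terminal state.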
21237754SJeff.Bonwick@Sun.COM */ 21247754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 21257754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 21267754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 21277754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 21287754SJeff.Bonwick@Sun.COM 21297754SJeff.Bonwick@Sun.COM for (int c = 0; c < ZIO_CHILD_TYPES; c++) 21307754SJeff.Bonwick@Sun.COM for (int w = 0; w < ZIO_WAIT_TYPES; w++) 21317754SJeff.Bonwick@Sun.COM ASSERT(zio->io_children[c][w] == 0); 21327754SJeff.Bonwick@Sun.COM 21337754SJeff.Bonwick@Sun.COM if (bp != NULL) { 21347754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[0] == 0); 21357754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[1] == 0); 21367754SJeff.Bonwick@Sun.COM ASSERT(bp->blk_pad[2] == 0); 21377754SJeff.Bonwick@Sun.COM ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 21388632SBill.Moore@Sun.COM (bp == zio_unique_parent(zio)->io_bp)); 21397754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 21407754SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 21417754SJeff.Bonwick@Sun.COM ASSERT(!BP_SHOULD_BYTESWAP(bp)); 21427754SJeff.Bonwick@Sun.COM ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp)); 21437754SJeff.Bonwick@Sun.COM ASSERT(BP_COUNT_GANG(bp) == 0 || 21447754SJeff.Bonwick@Sun.COM (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 21457754SJeff.Bonwick@Sun.COM } 21467754SJeff.Bonwick@Sun.COM } 21477754SJeff.Bonwick@Sun.COM 21487754SJeff.Bonwick@Sun.COM /* 21497754SJeff.Bonwick@Sun.COM * If there were child vdev or gang errors, they apply to us now. 21507754SJeff.Bonwick@Sun.COM */ 21517754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 21527754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 21537754SJeff.Bonwick@Sun.COM 21547754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio); /* note: may set zio->io_error */ 21557754SJeff.Bonwick@Sun.COM 21567754SJeff.Bonwick@Sun.COM vdev_stat_update(zio, psize); 21577754SJeff.Bonwick@Sun.COM 21587754SJeff.Bonwick@Sun.COM if (zio->io_error) { 21597754SJeff.Bonwick@Sun.COM /* 21607754SJeff.Bonwick@Sun.COM * If this I/O is attached to a particular vdev, 21617754SJeff.Bonwick@Sun.COM * generate an error message describing the I/O failure 21627754SJeff.Bonwick@Sun.COM * at the block level. We ignore these errors if the 21637754SJeff.Bonwick@Sun.COM * device is currently unavailable. 21647754SJeff.Bonwick@Sun.COM */ 21657754SJeff.Bonwick@Sun.COM if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 21667754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 21677754SJeff.Bonwick@Sun.COM 21687754SJeff.Bonwick@Sun.COM if ((zio->io_error == EIO || 21697754SJeff.Bonwick@Sun.COM !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { 21707754SJeff.Bonwick@Sun.COM /* 21717754SJeff.Bonwick@Sun.COM * For logical I/O requests, tell the SPA to log the 21727754SJeff.Bonwick@Sun.COM * error and generate a logical data ereport. 
21737754SJeff.Bonwick@Sun.COM */ 21747754SJeff.Bonwick@Sun.COM spa_log_error(spa, zio); 21757754SJeff.Bonwick@Sun.COM zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 21767754SJeff.Bonwick@Sun.COM 0, 0); 21777754SJeff.Bonwick@Sun.COM } 21787754SJeff.Bonwick@Sun.COM } 21797754SJeff.Bonwick@Sun.COM 21807754SJeff.Bonwick@Sun.COM if (zio->io_error && zio == lio) { 21817754SJeff.Bonwick@Sun.COM /* 21827754SJeff.Bonwick@Sun.COM * Determine whether zio should be reexecuted. This will 21837754SJeff.Bonwick@Sun.COM * propagate all the way to the root via zio_notify_parent(). 21847754SJeff.Bonwick@Sun.COM */ 21857754SJeff.Bonwick@Sun.COM ASSERT(vd == NULL && bp != NULL); 2186789Sahrens 21877754SJeff.Bonwick@Sun.COM if (IO_IS_ALLOCATING(zio)) 21887754SJeff.Bonwick@Sun.COM if (zio->io_error != ENOSPC) 21897754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_NOW; 21907754SJeff.Bonwick@Sun.COM else 21917754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 21927754SJeff.Bonwick@Sun.COM 21937754SJeff.Bonwick@Sun.COM if ((zio->io_type == ZIO_TYPE_READ || 21947754SJeff.Bonwick@Sun.COM zio->io_type == ZIO_TYPE_FREE) && 21957754SJeff.Bonwick@Sun.COM zio->io_error == ENXIO && 21968241SJeff.Bonwick@Sun.COM spa->spa_load_state == SPA_LOAD_NONE && 21977754SJeff.Bonwick@Sun.COM spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 21987754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 21997754SJeff.Bonwick@Sun.COM 22007754SJeff.Bonwick@Sun.COM if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 22017754SJeff.Bonwick@Sun.COM zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 22027754SJeff.Bonwick@Sun.COM } 22037754SJeff.Bonwick@Sun.COM 22047754SJeff.Bonwick@Sun.COM /* 22057754SJeff.Bonwick@Sun.COM * If there were logical child errors, they apply to us now. 22067754SJeff.Bonwick@Sun.COM * We defer this until now to avoid conflating logical child 22077754SJeff.Bonwick@Sun.COM * errors with errors that happened to the zio itself when 22087754SJeff.Bonwick@Sun.COM * updating vdev stats and reporting FMA events above. 22097754SJeff.Bonwick@Sun.COM */ 22107754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 22117754SJeff.Bonwick@Sun.COM 2212*9443SBill.Moore@Sun.COM if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && 2213*9443SBill.Moore@Sun.COM zio->io_child_type == ZIO_CHILD_LOGICAL) { 2214*9443SBill.Moore@Sun.COM ASSERT(zio->io_child_type != ZIO_CHILD_GANG); 2215*9443SBill.Moore@Sun.COM zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2216*9443SBill.Moore@Sun.COM } 2217*9443SBill.Moore@Sun.COM 2218*9443SBill.Moore@Sun.COM zio_gang_tree_free(&zio->io_gang_tree); 2219*9443SBill.Moore@Sun.COM 22209234SGeorge.Wilson@Sun.COM if (zio->io_reexecute && !(zio->io_flags & ZIO_FLAG_GODFATHER)) { 22217754SJeff.Bonwick@Sun.COM /* 22227754SJeff.Bonwick@Sun.COM * This is a logical I/O that wants to reexecute. 22237754SJeff.Bonwick@Sun.COM * 22247754SJeff.Bonwick@Sun.COM * Reexecute is top-down. When an i/o fails, if it's not 22257754SJeff.Bonwick@Sun.COM * the root, it simply notifies its parent and sticks around. 22267754SJeff.Bonwick@Sun.COM * The parent, seeing that it still has children in zio_done(), 22277754SJeff.Bonwick@Sun.COM * does the same. This percolates all the way up to the root. 22287754SJeff.Bonwick@Sun.COM * The root i/o will reexecute or suspend the entire tree. 
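 * (Whether the root suspends or reexecutes right away is driven by the
 * ZIO_REEXECUTE_SUSPEND vs. ZIO_REEXECUTE_NOW bits set in io_reexecute
 * above.)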
22297754SJeff.Bonwick@Sun.COM * 22307754SJeff.Bonwick@Sun.COM * This approach ensures that zio_reexecute() honors 22317754SJeff.Bonwick@Sun.COM * all the original i/o dependency relationships, e.g. 22327754SJeff.Bonwick@Sun.COM * parents not executing until children are ready. 22337754SJeff.Bonwick@Sun.COM */ 22347754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 22357754SJeff.Bonwick@Sun.COM 2236*9443SBill.Moore@Sun.COM zio->io_gang_leader = NULL; 22377754SJeff.Bonwick@Sun.COM 22388632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 22398632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 22408632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 22418632SBill.Moore@Sun.COM 22429234SGeorge.Wilson@Sun.COM /* 22439234SGeorge.Wilson@Sun.COM * "The Godfather" I/O monitors its children but is 22449234SGeorge.Wilson@Sun.COM * not a true parent to them. It will track them through 22459234SGeorge.Wilson@Sun.COM * the pipeline but severs its ties whenever they get into 22469234SGeorge.Wilson@Sun.COM * trouble (e.g. suspended). This allows "The Godfather" 22479234SGeorge.Wilson@Sun.COM * I/O to return status without blocking. 22489234SGeorge.Wilson@Sun.COM */ 22499234SGeorge.Wilson@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 22509234SGeorge.Wilson@Sun.COM zio_link_t *zl = zio->io_walk_link; 22519234SGeorge.Wilson@Sun.COM pio_next = zio_walk_parents(zio); 22529234SGeorge.Wilson@Sun.COM 22539234SGeorge.Wilson@Sun.COM if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 22549234SGeorge.Wilson@Sun.COM (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 22559234SGeorge.Wilson@Sun.COM zio_remove_child(pio, zio, zl); 22569234SGeorge.Wilson@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 22579234SGeorge.Wilson@Sun.COM } 22589234SGeorge.Wilson@Sun.COM } 22599234SGeorge.Wilson@Sun.COM 22608632SBill.Moore@Sun.COM if ((pio = zio_unique_parent(zio)) != NULL) { 22617754SJeff.Bonwick@Sun.COM /* 22627754SJeff.Bonwick@Sun.COM * We're not a root i/o, so there's nothing to do 22637754SJeff.Bonwick@Sun.COM * but notify our parent. Don't propagate errors 22647754SJeff.Bonwick@Sun.COM * upward since we haven't permanently failed yet. 22657754SJeff.Bonwick@Sun.COM */ 22667754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 22677754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 22687754SJeff.Bonwick@Sun.COM } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 22697754SJeff.Bonwick@Sun.COM /* 22707754SJeff.Bonwick@Sun.COM * We'd fail again if we reexecuted now, so suspend 22717754SJeff.Bonwick@Sun.COM * until conditions improve (e.g. device comes online). 22727754SJeff.Bonwick@Sun.COM */ 22737754SJeff.Bonwick@Sun.COM zio_suspend(spa, zio); 22747754SJeff.Bonwick@Sun.COM } else { 22757754SJeff.Bonwick@Sun.COM /* 22767754SJeff.Bonwick@Sun.COM * Reexecution is potentially a huge amount of work. 22777754SJeff.Bonwick@Sun.COM * Hand it off to the otherwise-unused claim taskq. 
22787754SJeff.Bonwick@Sun.COM */ 22797754SJeff.Bonwick@Sun.COM (void) taskq_dispatch( 22807754SJeff.Bonwick@Sun.COM spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 22817754SJeff.Bonwick@Sun.COM (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 22827754SJeff.Bonwick@Sun.COM } 22837754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 22847754SJeff.Bonwick@Sun.COM } 22857754SJeff.Bonwick@Sun.COM 22868632SBill.Moore@Sun.COM ASSERT(zio_walk_children(zio) == NULL); 22879234SGeorge.Wilson@Sun.COM ASSERT(zio->io_reexecute == 0 || (zio->io_flags & ZIO_FLAG_GODFATHER)); 22887754SJeff.Bonwick@Sun.COM ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 22897754SJeff.Bonwick@Sun.COM 22908632SBill.Moore@Sun.COM /* 22918632SBill.Moore@Sun.COM * It is the responsibility of the done callback to ensure that this 22928632SBill.Moore@Sun.COM * particular zio is no longer discoverable for adoption, and as 22938632SBill.Moore@Sun.COM * such, cannot acquire any new parents. 22948632SBill.Moore@Sun.COM */ 22957754SJeff.Bonwick@Sun.COM if (zio->io_done) 22967754SJeff.Bonwick@Sun.COM zio->io_done(zio); 22977754SJeff.Bonwick@Sun.COM 22988632SBill.Moore@Sun.COM mutex_enter(&zio->io_lock); 22998632SBill.Moore@Sun.COM zio->io_state[ZIO_WAIT_DONE] = 1; 23008632SBill.Moore@Sun.COM mutex_exit(&zio->io_lock); 23017754SJeff.Bonwick@Sun.COM 23028632SBill.Moore@Sun.COM for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 23038632SBill.Moore@Sun.COM zio_link_t *zl = zio->io_walk_link; 23048632SBill.Moore@Sun.COM pio_next = zio_walk_parents(zio); 23058632SBill.Moore@Sun.COM zio_remove_child(pio, zio, zl); 23067754SJeff.Bonwick@Sun.COM zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 23077754SJeff.Bonwick@Sun.COM } 23087754SJeff.Bonwick@Sun.COM 23097754SJeff.Bonwick@Sun.COM if (zio->io_waiter != NULL) { 23107754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 23117754SJeff.Bonwick@Sun.COM zio->io_executor = NULL; 23127754SJeff.Bonwick@Sun.COM cv_broadcast(&zio->io_cv); 23137754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 23147754SJeff.Bonwick@Sun.COM } else { 23157754SJeff.Bonwick@Sun.COM zio_destroy(zio); 23167754SJeff.Bonwick@Sun.COM } 23177754SJeff.Bonwick@Sun.COM 23187754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 23197754SJeff.Bonwick@Sun.COM } 23207754SJeff.Bonwick@Sun.COM 23217754SJeff.Bonwick@Sun.COM /* 23227754SJeff.Bonwick@Sun.COM * ========================================================================== 23237754SJeff.Bonwick@Sun.COM * I/O pipeline definition 23247754SJeff.Bonwick@Sun.COM * ========================================================================== 23257754SJeff.Bonwick@Sun.COM */ 23267754SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { 23275530Sbonwick NULL, 23285530Sbonwick zio_issue_async, 23297754SJeff.Bonwick@Sun.COM zio_read_bp_init, 23307754SJeff.Bonwick@Sun.COM zio_write_bp_init, 2331789Sahrens zio_checksum_generate, 23327754SJeff.Bonwick@Sun.COM zio_gang_assemble, 23337754SJeff.Bonwick@Sun.COM zio_gang_issue, 2334789Sahrens zio_dva_allocate, 2335789Sahrens zio_dva_free, 2336789Sahrens zio_dva_claim, 2337789Sahrens zio_ready, 2338789Sahrens zio_vdev_io_start, 2339789Sahrens zio_vdev_io_done, 2340789Sahrens zio_vdev_io_assess, 2341789Sahrens zio_checksum_verify, 23427754SJeff.Bonwick@Sun.COM zio_done 2343789Sahrens }; 2344