1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 226245Smaybee * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 
24789Sahrens */ 25789Sahrens 26789Sahrens #include <sys/zfs_context.h> 271544Seschrock #include <sys/fm/fs/zfs.h> 28789Sahrens #include <sys/spa.h> 29789Sahrens #include <sys/txg.h> 30789Sahrens #include <sys/spa_impl.h> 31789Sahrens #include <sys/vdev_impl.h> 32789Sahrens #include <sys/zio_impl.h> 33789Sahrens #include <sys/zio_compress.h> 34789Sahrens #include <sys/zio_checksum.h> 35789Sahrens 36789Sahrens /* 37789Sahrens * ========================================================================== 38789Sahrens * I/O priority table 39789Sahrens * ========================================================================== 40789Sahrens */ 41789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 42789Sahrens 0, /* ZIO_PRIORITY_NOW */ 43789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 44789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 45789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 46789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 47789Sahrens 4, /* ZIO_PRIORITY_FREE */ 48789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 49789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 50789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 51789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 52789Sahrens }; 53789Sahrens 54789Sahrens /* 55789Sahrens * ========================================================================== 56789Sahrens * I/O type descriptions 57789Sahrens * ========================================================================== 58789Sahrens */ 59789Sahrens char *zio_type_name[ZIO_TYPES] = { 60789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 61789Sahrens 62*7754SJeff.Bonwick@Sun.COM #define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ 63*7754SJeff.Bonwick@Sun.COM #define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ 64*7754SJeff.Bonwick@Sun.COM #define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ 655329Sgw25295 66789Sahrens /* 67789Sahrens * ========================================================================== 68789Sahrens * I/O kmem 
caches 69789Sahrens * ========================================================================== 70789Sahrens */ 714055Seschrock kmem_cache_t *zio_cache; 72789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 733290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 743290Sjohansen 753290Sjohansen #ifdef _KERNEL 763290Sjohansen extern vmem_t *zio_alloc_arena; 773290Sjohansen #endif 78789Sahrens 795329Sgw25295 /* 80*7754SJeff.Bonwick@Sun.COM * An allocating zio is one that either currently has the DVA allocate 81*7754SJeff.Bonwick@Sun.COM * stage set or will have it later in its lifetime. 825329Sgw25295 */ 835329Sgw25295 #define IO_IS_ALLOCATING(zio) \ 845688Sbonwick ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 855329Sgw25295 86789Sahrens void 87789Sahrens zio_init(void) 88789Sahrens { 89789Sahrens size_t c; 903290Sjohansen vmem_t *data_alloc_arena = NULL; 913290Sjohansen 923290Sjohansen #ifdef _KERNEL 933290Sjohansen data_alloc_arena = zio_alloc_arena; 943290Sjohansen #endif 954055Seschrock zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 964055Seschrock NULL, NULL, NULL, NULL, NULL, 0); 974055Seschrock 98789Sahrens /* 99789Sahrens * For small buffers, we want a cache for each multiple of 100789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 101789Sahrens * for each quarter-power of 2. For large buffers, we want 102789Sahrens * a cache for each multiple of PAGESIZE. 
103789Sahrens */ 104789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 105789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 106789Sahrens size_t p2 = size; 107789Sahrens size_t align = 0; 108789Sahrens 109789Sahrens while (p2 & (p2 - 1)) 110789Sahrens p2 &= p2 - 1; 111789Sahrens 112789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 113789Sahrens align = SPA_MINBLOCKSIZE; 114789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 115789Sahrens align = PAGESIZE; 116789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 117789Sahrens align = p2 >> 2; 118789Sahrens } 119789Sahrens 120789Sahrens if (align != 0) { 1213290Sjohansen char name[36]; 1222856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 123789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 124849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1253290Sjohansen 1263290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1273290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1283290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1293290Sjohansen KMC_NODEBUG); 130789Sahrens } 131789Sahrens } 132789Sahrens 133789Sahrens while (--c != 0) { 134789Sahrens ASSERT(zio_buf_cache[c] != NULL); 135789Sahrens if (zio_buf_cache[c - 1] == NULL) 136789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1373290Sjohansen 1383290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1393290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1403290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 141789Sahrens } 1421544Seschrock 1431544Seschrock zio_inject_init(); 144789Sahrens } 145789Sahrens 146789Sahrens void 147789Sahrens zio_fini(void) 148789Sahrens { 149789Sahrens size_t c; 150789Sahrens kmem_cache_t *last_cache = NULL; 1513290Sjohansen kmem_cache_t *last_data_cache = NULL; 152789Sahrens 153789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 154789Sahrens if (zio_buf_cache[c] != last_cache) { 155789Sahrens 
last_cache = zio_buf_cache[c]; 156789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 157789Sahrens } 158789Sahrens zio_buf_cache[c] = NULL; 1593290Sjohansen 1603290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1613290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1623290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1633290Sjohansen } 1643290Sjohansen zio_data_buf_cache[c] = NULL; 165789Sahrens } 1661544Seschrock 1674055Seschrock kmem_cache_destroy(zio_cache); 1684055Seschrock 1691544Seschrock zio_inject_fini(); 170789Sahrens } 171789Sahrens 172789Sahrens /* 173789Sahrens * ========================================================================== 174789Sahrens * Allocate and free I/O buffers 175789Sahrens * ========================================================================== 176789Sahrens */ 1773290Sjohansen 1783290Sjohansen /* 1793290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1803290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1813290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1823290Sjohansen * excess / transient data in-core during a crashdump. 1833290Sjohansen */ 184789Sahrens void * 185789Sahrens zio_buf_alloc(size_t size) 186789Sahrens { 187789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 188789Sahrens 189789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 190789Sahrens 1916245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 192789Sahrens } 193789Sahrens 1943290Sjohansen /* 1953290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 1963290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 1973290Sjohansen * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount 1983290Sjohansen * of kernel heap dumped to disk when the kernel panics) 1993290Sjohansen */ 2003290Sjohansen void * 2013290Sjohansen zio_data_buf_alloc(size_t size) 2023290Sjohansen { 2033290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2043290Sjohansen 2053290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2063290Sjohansen 2076245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2083290Sjohansen } 2093290Sjohansen 210789Sahrens void 211789Sahrens zio_buf_free(void *buf, size_t size) 212789Sahrens { 213789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 214789Sahrens 215789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 216789Sahrens 217789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 218789Sahrens } 219789Sahrens 2203290Sjohansen void 2213290Sjohansen zio_data_buf_free(void *buf, size_t size) 2223290Sjohansen { 2233290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2243290Sjohansen 2253290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2263290Sjohansen 2273290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2283290Sjohansen } 2293463Sahrens 230789Sahrens /* 231789Sahrens * ========================================================================== 232789Sahrens * Push and pop I/O transform buffers 233789Sahrens * ========================================================================== 234789Sahrens */ 235789Sahrens static void 236*7754SJeff.Bonwick@Sun.COM zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 237*7754SJeff.Bonwick@Sun.COM zio_transform_func_t *transform) 238789Sahrens { 239789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 240789Sahrens 241*7754SJeff.Bonwick@Sun.COM zt->zt_orig_data = zio->io_data; 242*7754SJeff.Bonwick@Sun.COM zt->zt_orig_size = zio->io_size; 243789Sahrens zt->zt_bufsize = bufsize; 244*7754SJeff.Bonwick@Sun.COM zt->zt_transform = transform; 245789Sahrens 246789Sahrens 
zt->zt_next = zio->io_transform_stack; 247789Sahrens zio->io_transform_stack = zt; 248789Sahrens 249789Sahrens zio->io_data = data; 250789Sahrens zio->io_size = size; 251789Sahrens } 252789Sahrens 253789Sahrens static void 254*7754SJeff.Bonwick@Sun.COM zio_pop_transforms(zio_t *zio) 255789Sahrens { 256*7754SJeff.Bonwick@Sun.COM zio_transform_t *zt; 257789Sahrens 258*7754SJeff.Bonwick@Sun.COM while ((zt = zio->io_transform_stack) != NULL) { 259*7754SJeff.Bonwick@Sun.COM if (zt->zt_transform != NULL) 260*7754SJeff.Bonwick@Sun.COM zt->zt_transform(zio, 261*7754SJeff.Bonwick@Sun.COM zt->zt_orig_data, zt->zt_orig_size); 262789Sahrens 263*7754SJeff.Bonwick@Sun.COM zio_buf_free(zio->io_data, zt->zt_bufsize); 264789Sahrens 265*7754SJeff.Bonwick@Sun.COM zio->io_data = zt->zt_orig_data; 266*7754SJeff.Bonwick@Sun.COM zio->io_size = zt->zt_orig_size; 267*7754SJeff.Bonwick@Sun.COM zio->io_transform_stack = zt->zt_next; 268789Sahrens 269*7754SJeff.Bonwick@Sun.COM kmem_free(zt, sizeof (zio_transform_t)); 270789Sahrens } 271789Sahrens } 272789Sahrens 273789Sahrens /* 274789Sahrens * ========================================================================== 275*7754SJeff.Bonwick@Sun.COM * I/O transform callbacks for subblocks and decompression 276*7754SJeff.Bonwick@Sun.COM * ========================================================================== 277*7754SJeff.Bonwick@Sun.COM */ 278*7754SJeff.Bonwick@Sun.COM static void 279*7754SJeff.Bonwick@Sun.COM zio_subblock(zio_t *zio, void *data, uint64_t size) 280*7754SJeff.Bonwick@Sun.COM { 281*7754SJeff.Bonwick@Sun.COM ASSERT(zio->io_size > size); 282*7754SJeff.Bonwick@Sun.COM 283*7754SJeff.Bonwick@Sun.COM if (zio->io_type == ZIO_TYPE_READ) 284*7754SJeff.Bonwick@Sun.COM bcopy(zio->io_data, data, size); 285*7754SJeff.Bonwick@Sun.COM } 286*7754SJeff.Bonwick@Sun.COM 287*7754SJeff.Bonwick@Sun.COM static void 288*7754SJeff.Bonwick@Sun.COM zio_decompress(zio_t *zio, void *data, uint64_t size) 289*7754SJeff.Bonwick@Sun.COM { 
290*7754SJeff.Bonwick@Sun.COM if (zio->io_error == 0 && 291*7754SJeff.Bonwick@Sun.COM zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 292*7754SJeff.Bonwick@Sun.COM zio->io_data, zio->io_size, data, size) != 0) 293*7754SJeff.Bonwick@Sun.COM zio->io_error = EIO; 294*7754SJeff.Bonwick@Sun.COM } 295*7754SJeff.Bonwick@Sun.COM 296*7754SJeff.Bonwick@Sun.COM /* 297*7754SJeff.Bonwick@Sun.COM * ========================================================================== 298*7754SJeff.Bonwick@Sun.COM * I/O parent/child relationships and pipeline interlocks 299*7754SJeff.Bonwick@Sun.COM * ========================================================================== 300*7754SJeff.Bonwick@Sun.COM */ 301*7754SJeff.Bonwick@Sun.COM 302*7754SJeff.Bonwick@Sun.COM static void 303*7754SJeff.Bonwick@Sun.COM zio_add_child(zio_t *pio, zio_t *zio) 304*7754SJeff.Bonwick@Sun.COM { 305*7754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 306*7754SJeff.Bonwick@Sun.COM if (zio->io_stage < ZIO_STAGE_READY) 307*7754SJeff.Bonwick@Sun.COM pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; 308*7754SJeff.Bonwick@Sun.COM if (zio->io_stage < ZIO_STAGE_DONE) 309*7754SJeff.Bonwick@Sun.COM pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; 310*7754SJeff.Bonwick@Sun.COM zio->io_sibling_prev = NULL; 311*7754SJeff.Bonwick@Sun.COM zio->io_sibling_next = pio->io_child; 312*7754SJeff.Bonwick@Sun.COM if (pio->io_child != NULL) 313*7754SJeff.Bonwick@Sun.COM pio->io_child->io_sibling_prev = zio; 314*7754SJeff.Bonwick@Sun.COM pio->io_child = zio; 315*7754SJeff.Bonwick@Sun.COM zio->io_parent = pio; 316*7754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 317*7754SJeff.Bonwick@Sun.COM } 318*7754SJeff.Bonwick@Sun.COM 319*7754SJeff.Bonwick@Sun.COM static void 320*7754SJeff.Bonwick@Sun.COM zio_remove_child(zio_t *pio, zio_t *zio) 321*7754SJeff.Bonwick@Sun.COM { 322*7754SJeff.Bonwick@Sun.COM zio_t *next, *prev; 323*7754SJeff.Bonwick@Sun.COM 324*7754SJeff.Bonwick@Sun.COM ASSERT(zio->io_parent == pio); 
325*7754SJeff.Bonwick@Sun.COM 326*7754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 327*7754SJeff.Bonwick@Sun.COM next = zio->io_sibling_next; 328*7754SJeff.Bonwick@Sun.COM prev = zio->io_sibling_prev; 329*7754SJeff.Bonwick@Sun.COM if (next != NULL) 330*7754SJeff.Bonwick@Sun.COM next->io_sibling_prev = prev; 331*7754SJeff.Bonwick@Sun.COM if (prev != NULL) 332*7754SJeff.Bonwick@Sun.COM prev->io_sibling_next = next; 333*7754SJeff.Bonwick@Sun.COM if (pio->io_child == zio) 334*7754SJeff.Bonwick@Sun.COM pio->io_child = next; 335*7754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 336*7754SJeff.Bonwick@Sun.COM } 337*7754SJeff.Bonwick@Sun.COM 338*7754SJeff.Bonwick@Sun.COM static boolean_t 339*7754SJeff.Bonwick@Sun.COM zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 340*7754SJeff.Bonwick@Sun.COM { 341*7754SJeff.Bonwick@Sun.COM uint64_t *countp = &zio->io_children[child][wait]; 342*7754SJeff.Bonwick@Sun.COM boolean_t waiting = B_FALSE; 343*7754SJeff.Bonwick@Sun.COM 344*7754SJeff.Bonwick@Sun.COM mutex_enter(&zio->io_lock); 345*7754SJeff.Bonwick@Sun.COM ASSERT(zio->io_stall == NULL); 346*7754SJeff.Bonwick@Sun.COM if (*countp != 0) { 347*7754SJeff.Bonwick@Sun.COM zio->io_stage--; 348*7754SJeff.Bonwick@Sun.COM zio->io_stall = countp; 349*7754SJeff.Bonwick@Sun.COM waiting = B_TRUE; 350*7754SJeff.Bonwick@Sun.COM } 351*7754SJeff.Bonwick@Sun.COM mutex_exit(&zio->io_lock); 352*7754SJeff.Bonwick@Sun.COM 353*7754SJeff.Bonwick@Sun.COM return (waiting); 354*7754SJeff.Bonwick@Sun.COM } 355*7754SJeff.Bonwick@Sun.COM 356*7754SJeff.Bonwick@Sun.COM static void 357*7754SJeff.Bonwick@Sun.COM zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 358*7754SJeff.Bonwick@Sun.COM { 359*7754SJeff.Bonwick@Sun.COM uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 360*7754SJeff.Bonwick@Sun.COM int *errorp = &pio->io_child_error[zio->io_child_type]; 361*7754SJeff.Bonwick@Sun.COM 362*7754SJeff.Bonwick@Sun.COM mutex_enter(&pio->io_lock); 
363*7754SJeff.Bonwick@Sun.COM if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 364*7754SJeff.Bonwick@Sun.COM *errorp = zio_worst_error(*errorp, zio->io_error); 365*7754SJeff.Bonwick@Sun.COM pio->io_reexecute |= zio->io_reexecute; 366*7754SJeff.Bonwick@Sun.COM ASSERT3U(*countp, >, 0); 367*7754SJeff.Bonwick@Sun.COM if (--*countp == 0 && pio->io_stall == countp) { 368*7754SJeff.Bonwick@Sun.COM pio->io_stall = NULL; 369*7754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 370*7754SJeff.Bonwick@Sun.COM zio_execute(pio); 371*7754SJeff.Bonwick@Sun.COM } else { 372*7754SJeff.Bonwick@Sun.COM mutex_exit(&pio->io_lock); 373*7754SJeff.Bonwick@Sun.COM } 374*7754SJeff.Bonwick@Sun.COM } 375*7754SJeff.Bonwick@Sun.COM 376*7754SJeff.Bonwick@Sun.COM static void 377*7754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio_t *zio, enum zio_child c) 378*7754SJeff.Bonwick@Sun.COM { 379*7754SJeff.Bonwick@Sun.COM if (zio->io_child_error[c] != 0 && zio->io_error == 0) 380*7754SJeff.Bonwick@Sun.COM zio->io_error = zio->io_child_error[c]; 381*7754SJeff.Bonwick@Sun.COM } 382*7754SJeff.Bonwick@Sun.COM 383*7754SJeff.Bonwick@Sun.COM /* 384*7754SJeff.Bonwick@Sun.COM * ========================================================================== 385*7754SJeff.Bonwick@Sun.COM * Create the various types of I/O (read, write, free, etc) 386789Sahrens * ========================================================================== 387789Sahrens */ 388789Sahrens static zio_t * 389789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 390789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 391*7754SJeff.Bonwick@Sun.COM zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, 392*7754SJeff.Bonwick@Sun.COM const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) 393789Sahrens { 394789Sahrens zio_t *zio; 395789Sahrens 396789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 397789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 
398*7754SJeff.Bonwick@Sun.COM ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 399789Sahrens 400*7754SJeff.Bonwick@Sun.COM ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 401*7754SJeff.Bonwick@Sun.COM ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 402*7754SJeff.Bonwick@Sun.COM ASSERT(vd || stage == ZIO_STAGE_OPEN); 4037046Sahrens 4044055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 4054055Seschrock bzero(zio, sizeof (zio_t)); 406*7754SJeff.Bonwick@Sun.COM 407*7754SJeff.Bonwick@Sun.COM mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 408*7754SJeff.Bonwick@Sun.COM cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 409*7754SJeff.Bonwick@Sun.COM 410*7754SJeff.Bonwick@Sun.COM if (vd != NULL) 411*7754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_VDEV; 412*7754SJeff.Bonwick@Sun.COM else if (flags & ZIO_FLAG_GANG_CHILD) 413*7754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_GANG; 414*7754SJeff.Bonwick@Sun.COM else 415*7754SJeff.Bonwick@Sun.COM zio->io_child_type = ZIO_CHILD_LOGICAL; 416*7754SJeff.Bonwick@Sun.COM 417789Sahrens if (bp != NULL) { 418789Sahrens zio->io_bp = bp; 419789Sahrens zio->io_bp_copy = *bp; 420789Sahrens zio->io_bp_orig = *bp; 421*7754SJeff.Bonwick@Sun.COM if (type != ZIO_TYPE_WRITE) 422*7754SJeff.Bonwick@Sun.COM zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 423*7754SJeff.Bonwick@Sun.COM if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 424*7754SJeff.Bonwick@Sun.COM if (BP_IS_GANG(bp)) 425*7754SJeff.Bonwick@Sun.COM pipeline |= ZIO_GANG_STAGES; 426*7754SJeff.Bonwick@Sun.COM zio->io_logical = zio; 427*7754SJeff.Bonwick@Sun.COM } 428789Sahrens } 429*7754SJeff.Bonwick@Sun.COM 430*7754SJeff.Bonwick@Sun.COM zio->io_spa = spa; 431*7754SJeff.Bonwick@Sun.COM zio->io_txg = txg; 432*7754SJeff.Bonwick@Sun.COM zio->io_data = data; 433*7754SJeff.Bonwick@Sun.COM zio->io_size = size; 434789Sahrens zio->io_done = done; 435789Sahrens zio->io_private = private; 436789Sahrens zio->io_type = type; 437789Sahrens zio->io_priority 
= priority; 438*7754SJeff.Bonwick@Sun.COM zio->io_vd = vd; 439*7754SJeff.Bonwick@Sun.COM zio->io_offset = offset; 440*7754SJeff.Bonwick@Sun.COM zio->io_orig_flags = zio->io_flags = flags; 441*7754SJeff.Bonwick@Sun.COM zio->io_orig_stage = zio->io_stage = stage; 442*7754SJeff.Bonwick@Sun.COM zio->io_orig_pipeline = zio->io_pipeline = pipeline; 443*7754SJeff.Bonwick@Sun.COM 444*7754SJeff.Bonwick@Sun.COM if (zb != NULL) 445*7754SJeff.Bonwick@Sun.COM zio->io_bookmark = *zb; 446789Sahrens 447*7754SJeff.Bonwick@Sun.COM if (pio != NULL) { 448*7754SJeff.Bonwick@Sun.COM /* 449*7754SJeff.Bonwick@Sun.COM * Logical I/Os can have logical, gang, or vdev children. 450*7754SJeff.Bonwick@Sun.COM * Gang I/Os can have gang or vdev children. 451*7754SJeff.Bonwick@Sun.COM * Vdev I/Os can only have vdev children. 452*7754SJeff.Bonwick@Sun.COM * The following ASSERT captures all of these constraints. 453*7754SJeff.Bonwick@Sun.COM */ 454*7754SJeff.Bonwick@Sun.COM ASSERT(zio->io_child_type <= pio->io_child_type); 455*7754SJeff.Bonwick@Sun.COM if (zio->io_logical == NULL) 4561544Seschrock zio->io_logical = pio->io_logical; 457*7754SJeff.Bonwick@Sun.COM zio_add_child(pio, zio); 458789Sahrens } 459789Sahrens 460789Sahrens return (zio); 461789Sahrens } 462789Sahrens 4635329Sgw25295 static void 464*7754SJeff.Bonwick@Sun.COM zio_destroy(zio_t *zio) 4655329Sgw25295 { 466*7754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 467*7754SJeff.Bonwick@Sun.COM uint8_t async_root = zio->io_async_root; 4685329Sgw25295 469*7754SJeff.Bonwick@Sun.COM mutex_destroy(&zio->io_lock); 470*7754SJeff.Bonwick@Sun.COM cv_destroy(&zio->io_cv); 471*7754SJeff.Bonwick@Sun.COM kmem_cache_free(zio_cache, zio); 472*7754SJeff.Bonwick@Sun.COM 473*7754SJeff.Bonwick@Sun.COM if (async_root) { 474*7754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_async_root_lock); 475*7754SJeff.Bonwick@Sun.COM if (--spa->spa_async_root_count == 0) 476*7754SJeff.Bonwick@Sun.COM cv_broadcast(&spa->spa_async_root_cv); 477*7754SJeff.Bonwick@Sun.COM 
mutex_exit(&spa->spa_async_root_lock); 478*7754SJeff.Bonwick@Sun.COM } 4795329Sgw25295 } 4805329Sgw25295 481789Sahrens zio_t * 482789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 483789Sahrens int flags) 484789Sahrens { 485789Sahrens zio_t *zio; 486789Sahrens 487789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 488*7754SJeff.Bonwick@Sun.COM ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, 489*7754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 490789Sahrens 491789Sahrens return (zio); 492789Sahrens } 493789Sahrens 494789Sahrens zio_t * 495789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 496789Sahrens { 497789Sahrens return (zio_null(NULL, spa, done, private, flags)); 498789Sahrens } 499789Sahrens 500789Sahrens zio_t * 501*7754SJeff.Bonwick@Sun.COM zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 502*7754SJeff.Bonwick@Sun.COM void *data, uint64_t size, zio_done_func_t *done, void *private, 5037046Sahrens int priority, int flags, const zbookmark_t *zb) 504789Sahrens { 505789Sahrens zio_t *zio; 506789Sahrens 5077046Sahrens zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, 5087046Sahrens data, size, done, private, 509*7754SJeff.Bonwick@Sun.COM ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 5102981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 511789Sahrens 512789Sahrens return (zio); 513789Sahrens } 514789Sahrens 515789Sahrens zio_t * 516*7754SJeff.Bonwick@Sun.COM zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 517*7754SJeff.Bonwick@Sun.COM void *data, uint64_t size, zio_prop_t *zp, 518*7754SJeff.Bonwick@Sun.COM zio_done_func_t *ready, zio_done_func_t *done, void *private, 519*7754SJeff.Bonwick@Sun.COM int priority, int flags, const zbookmark_t *zb) 520789Sahrens { 521789Sahrens zio_t *zio; 522789Sahrens 523*7754SJeff.Bonwick@Sun.COM ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 524*7754SJeff.Bonwick@Sun.COM zp->zp_checksum < 
ZIO_CHECKSUM_FUNCTIONS && 525*7754SJeff.Bonwick@Sun.COM zp->zp_compress >= ZIO_COMPRESS_OFF && 526*7754SJeff.Bonwick@Sun.COM zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 527*7754SJeff.Bonwick@Sun.COM zp->zp_type < DMU_OT_NUMTYPES && 528*7754SJeff.Bonwick@Sun.COM zp->zp_level < 32 && 529*7754SJeff.Bonwick@Sun.COM zp->zp_ndvas > 0 && 530*7754SJeff.Bonwick@Sun.COM zp->zp_ndvas <= spa_max_replication(spa)); 531*7754SJeff.Bonwick@Sun.COM ASSERT(ready != NULL); 5325329Sgw25295 533789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 534*7754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 535789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 536789Sahrens 5373547Smaybee zio->io_ready = ready; 538*7754SJeff.Bonwick@Sun.COM zio->io_prop = *zp; 539789Sahrens 540789Sahrens return (zio); 541789Sahrens } 542789Sahrens 543789Sahrens zio_t * 544*7754SJeff.Bonwick@Sun.COM zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 545*7754SJeff.Bonwick@Sun.COM uint64_t size, zio_done_func_t *done, void *private, int priority, 546*7754SJeff.Bonwick@Sun.COM int flags, zbookmark_t *zb) 547789Sahrens { 548789Sahrens zio_t *zio; 549789Sahrens 5507181Sperrin zio = zio_create(pio, spa, txg, bp, data, size, done, private, 551*7754SJeff.Bonwick@Sun.COM ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 552*7754SJeff.Bonwick@Sun.COM ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 553789Sahrens 554789Sahrens return (zio); 555789Sahrens } 556789Sahrens 557789Sahrens zio_t * 558789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 559*7754SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, int flags) 560789Sahrens { 561789Sahrens zio_t *zio; 562789Sahrens 563789Sahrens ASSERT(!BP_IS_HOLE(bp)); 564789Sahrens 565*7754SJeff.Bonwick@Sun.COM if (bp->blk_fill == BLK_FILL_ALREADY_FREED) 566*7754SJeff.Bonwick@Sun.COM return (zio_null(pio, spa, NULL, NULL, flags)); 567*7754SJeff.Bonwick@Sun.COM 568789Sahrens if (txg == spa->spa_syncing_txg 
&& 569*7754SJeff.Bonwick@Sun.COM spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { 570789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 571*7754SJeff.Bonwick@Sun.COM return (zio_null(pio, spa, NULL, NULL, flags)); 572789Sahrens } 573789Sahrens 574*7754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 575*7754SJeff.Bonwick@Sun.COM done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 576*7754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 577789Sahrens 578789Sahrens return (zio); 579789Sahrens } 580789Sahrens 581789Sahrens zio_t * 582789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 583*7754SJeff.Bonwick@Sun.COM zio_done_func_t *done, void *private, int flags) 584789Sahrens { 585789Sahrens zio_t *zio; 586789Sahrens 587789Sahrens /* 588789Sahrens * A claim is an allocation of a specific block. Claims are needed 589789Sahrens * to support immediate writes in the intent log. The issue is that 590789Sahrens * immediate writes contain committed data, but in a txg that was 591789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 592789Sahrens * the intent log claims all blocks that contain immediate write data 593789Sahrens * so that the SPA knows they're in use. 594789Sahrens * 595789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 596789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 
597789Sahrens */ 598789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 599789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 600789Sahrens 601*7754SJeff.Bonwick@Sun.COM zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 602*7754SJeff.Bonwick@Sun.COM done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 603*7754SJeff.Bonwick@Sun.COM NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 604789Sahrens 605789Sahrens return (zio); 606789Sahrens } 607789Sahrens 608789Sahrens zio_t * 609789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 610789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 611789Sahrens { 612789Sahrens zio_t *zio; 613789Sahrens int c; 614789Sahrens 615789Sahrens if (vd->vdev_children == 0) { 616789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 617*7754SJeff.Bonwick@Sun.COM ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 618789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 619789Sahrens 620789Sahrens zio->io_cmd = cmd; 621789Sahrens } else { 622789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 623789Sahrens 624789Sahrens for (c = 0; c < vd->vdev_children; c++) 625789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 626789Sahrens done, private, priority, flags)); 627789Sahrens } 628789Sahrens 629789Sahrens return (zio); 630789Sahrens } 631789Sahrens 632789Sahrens zio_t * 633789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 634789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 6355450Sbrendan int priority, int flags, boolean_t labels) 636789Sahrens { 637789Sahrens zio_t *zio; 6385329Sgw25295 639*7754SJeff.Bonwick@Sun.COM ASSERT(vd->vdev_children == 0); 640*7754SJeff.Bonwick@Sun.COM ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 641*7754SJeff.Bonwick@Sun.COM offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 642*7754SJeff.Bonwick@Sun.COM ASSERT3U(offset + size, <=, vd->vdev_psize); 
	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

/*
 * Create a write zio that targets a physical offset on vdev 'vd' directly,
 * with no block pointer involved (zio_create() is passed a NULL bp and uses
 * the dedicated ZIO_WRITE_PHYS_PIPELINE).  'checksum' selects the checksum
 * algorithm recorded in io_prop.  When 'labels' is set, the write must fall
 * entirely within the front or back vdev label region, as enforced by the
 * ASSERTs below.
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	/* Physical writes go to leaf vdevs only. */
	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	/* 'vd' must be an immediate child of the parent zio's vdev. */
	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	/* Leaf vdev offsets are label-relative; skip the front labels. */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
	    vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	return (zio);
}

/*
 * Create a parentless I/O on leaf vdev 'vd'; zio_create() is called with
 * a NULL parent, so this zio stands alone.  NOTE(review): presumably used
 * when vdev-level logic (e.g. aggregation) issues work on its own behalf --
 * callers are not visible in this chunk.
 */
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, int flags, zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

/*
 * Asynchronously issue a DKIOCFLUSHWRITECACHE ioctl to flush vdev 'vd's
 * write cache, as a child of 'zio'; failures are tolerated (CANFAIL,
 * DONT_PROPAGATE, DONT_RETRY).
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

/*
 * Pipeline stage: prepare a logical read.  If the bp is compressed and this
 * zio is the logical zio, push a decompression transform sized to the bp's
 * physical size.  Level-0 non-metadata blocks are flagged DONT_CACHE.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: prepare an allocating write.  Optionally compresses the
 * data, decides between rewriting the existing block and allocating a new
 * one (to help spa_sync() converge -- see comments below), and fills in
 * the bp's size, compression, checksum, type, level and byteorder fields.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	zio_prop_t *zp = &zio->io_prop;
	int compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(compress != ZIO_COMPRESS_INHERIT);

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		ASSERT(pass > 1);

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/*
		 * Only MOS (objset 0) data should need to be rewritten.
		 */
		ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		/*
		 * If compression fails or doesn't help, fall back to
		 * storing the data uncompressed; a csize of 0 means the
		 * data compressed to nothing (all zeroes -- handled below).
		 */
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize)) {
			compress = ZIO_COMPRESS_OFF;
		} else if (csize != 0) {
			zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(csize != 0);
		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (csize == 0) {
		/* Nothing to write: skip the I/O stages entirely. */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, csize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */
873*7754SJeff.Bonwick@Sun.COM 874*7754SJeff.Bonwick@Sun.COM static void 875*7754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) 876*7754SJeff.Bonwick@Sun.COM { 877*7754SJeff.Bonwick@Sun.COM zio_type_t t = zio->io_type; 878*7754SJeff.Bonwick@Sun.COM 879*7754SJeff.Bonwick@Sun.COM /* 880*7754SJeff.Bonwick@Sun.COM * If we're a config writer, the normal issue and interrupt threads 881*7754SJeff.Bonwick@Sun.COM * may all be blocked waiting for the config lock. In this case, 882*7754SJeff.Bonwick@Sun.COM * select the otherwise-unused taskq for ZIO_TYPE_NULL. 883*7754SJeff.Bonwick@Sun.COM */ 884*7754SJeff.Bonwick@Sun.COM if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) 885*7754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 886*7754SJeff.Bonwick@Sun.COM 887*7754SJeff.Bonwick@Sun.COM /* 888*7754SJeff.Bonwick@Sun.COM * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 889*7754SJeff.Bonwick@Sun.COM */ 890*7754SJeff.Bonwick@Sun.COM if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 891*7754SJeff.Bonwick@Sun.COM t = ZIO_TYPE_NULL; 892*7754SJeff.Bonwick@Sun.COM 893*7754SJeff.Bonwick@Sun.COM (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], 894*7754SJeff.Bonwick@Sun.COM (task_func_t *)zio_execute, zio, TQ_SLEEP); 895*7754SJeff.Bonwick@Sun.COM } 896*7754SJeff.Bonwick@Sun.COM 897*7754SJeff.Bonwick@Sun.COM static boolean_t 898*7754SJeff.Bonwick@Sun.COM zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 899*7754SJeff.Bonwick@Sun.COM { 900*7754SJeff.Bonwick@Sun.COM kthread_t *executor = zio->io_executor; 901*7754SJeff.Bonwick@Sun.COM spa_t *spa = zio->io_spa; 902789Sahrens 903*7754SJeff.Bonwick@Sun.COM for (zio_type_t t = 0; t < ZIO_TYPES; t++) 904*7754SJeff.Bonwick@Sun.COM if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 905*7754SJeff.Bonwick@Sun.COM return (B_TRUE); 906*7754SJeff.Bonwick@Sun.COM 907*7754SJeff.Bonwick@Sun.COM return (B_FALSE); 908*7754SJeff.Bonwick@Sun.COM } 909*7754SJeff.Bonwick@Sun.COM 
/*
 * Pipeline stage: push this zio onto an issue taskq and stop executing
 * its pipeline in the current thread.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

	return (ZIO_PIPELINE_STOP);
}

/*
 * Dispatch 'zio' to an interrupt-side taskq thread for further processing.
 */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		zio_stage_t stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

		/* Advance to the next stage present in this zio's pipeline. */
		while (((1U << ++stage) & pipeline) == 0)
			continue;

		ASSERT(stage <= ZIO_STAGE_DONE);
		ASSERT(zio->io_stall == NULL);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * issue async to avoid deadlock.
		 */
		if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
		    zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[stage](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */

/*
 * Execute 'zio' synchronously: run its pipeline and sleep until it
 * completes (io_executor is cleared elsewhere at completion -- not
 * visible in this chunk).  Returns the zio's error; the zio is destroyed.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

/*
 * Execute 'zio' asynchronously; the caller does not wait for completion.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * Attach it to the pool's global async root zio so that
		 * spa_unload() has a way of waiting for async I/O to finish.
		 */
		spa_t *spa = zio->io_spa;
		zio->io_async_root = B_TRUE;
		mutex_enter(&spa->spa_async_root_lock);
		spa->spa_async_root_count++;
		mutex_exit(&spa->spa_async_root_lock);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

/*
 * Reset 'pio' to its original flags, stage and pipeline, clear its errors,
 * recursively reexecute all of its children, and then run it again.
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *zio, *zio_next;

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_error = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio)) {
		/*
		 * Remember the failed bp so that the io_ready() callback
		 * can update its accounting upon reexecution.  The block
		 * was already freed in zio_done(); we indicate this with
		 * a fill count of -1 so that zio_free() knows to skip it.
		 */
		blkptr_t *bp = pio->io_bp;
		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
		bp->blk_fill = BLK_FILL_ALREADY_FREED;
		pio->io_bp_orig = *bp;
		BP_ZERO(bp);
	}

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of the io_child list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of the io_child list, from 'zio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'zio'.
	 */
	for (zio = pio->io_child; zio != NULL; zio = zio_next) {
		zio_next = zio->io_sibling_next;
		mutex_enter(&pio->io_lock);
		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(zio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 */
	zio_execute(pio);
}

/*
 * Record an uncorrectable I/O failure on the pool: panic if the failmode
 * property says so, otherwise post an ereport, mark the pool suspended,
 * and park the failed logical zio (if any) under spa_suspend_zio_root so
 * it can be reexecuted by zio_resume().
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio->io_parent == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * Unsuspend the pool: wake any waiters and reexecute every previously
 * suspended i/o that was parked under spa_suspend_zio_root.
 */
void
zio_resume(spa_t *spa)
{
	zio_t *pio, *zio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return;

	while ((zio = pio->io_child) != NULL) {
		zio_remove_child(pio, zio);
		zio->io_parent = NULL;
		zio_reexecute(zio);
	}

	ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);

	(void) zio_wait(pio);
}

/*
 * Block the caller until the pool is no longer suspended.
 */
void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.
 * This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

/*
 * Gang issue callback for reads: gang headers (gn != NULL) are already in
 * io_gang_tree, so nothing to read; data leaves get a real zio_read().
 */
static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

/*
 * Gang issue callback for rewrites: rewrite the gang header (for interior
 * nodes) or the data itself (for leaves); see the checksum note below.
 */
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_logical->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/*
 * Gang issue callback for frees: trivial wrapper around zio_free().
 */
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

/*
 * Gang issue callback for claims: trivial wrapper around zio_claim().
 */
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

/* Per-zio-type gang issue callbacks, indexed by zio_type_t. */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

/*
 * Allocate a gang tree node (with an embedded gang-header buffer) and
 * store it at *gnpp, which must currently be NULL.
 */
static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

/*
 * Free a single gang tree node (its children must already be freed)
 * and clear the caller's pointer to it.
 */
static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

/*
 * Recursively free an entire gang tree rooted at *gnpp (post-order).
 */
static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

/*
 * Begin assembling the gang tree under logical zio 'lio' by reading the
 * gang header at 'bp' into a freshly allocated node; recursion into child
 * gang headers continues in zio_gang_tree_assemble_done().
 */
static void
zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(lio->io_logical == lio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
}

/*
 * Completion callback for a gang-header read issued by
 * zio_gang_tree_assemble(): byteswap the header if needed and (in code
 * beyond this chunk) recurse into any child gang headers.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *lio = zio->io_logical;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(zio->io_parent == lio);
	ASSERT(zio->io_child == NULL);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 1357*7754SJeff.Bonwick@Sun.COM 1358*7754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1359*7754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1360*7754SJeff.Bonwick@Sun.COM if (!BP_IS_GANG(gbp)) 1361*7754SJeff.Bonwick@Sun.COM continue; 1362*7754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); 1363789Sahrens } 1364789Sahrens } 1365789Sahrens 1366*7754SJeff.Bonwick@Sun.COM static void 1367*7754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1368789Sahrens { 1369*7754SJeff.Bonwick@Sun.COM zio_t *lio = pio->io_logical; 1370*7754SJeff.Bonwick@Sun.COM zio_t *zio; 1371*7754SJeff.Bonwick@Sun.COM 1372*7754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp) == !!gn); 1373*7754SJeff.Bonwick@Sun.COM ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); 1374*7754SJeff.Bonwick@Sun.COM ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); 1375*7754SJeff.Bonwick@Sun.COM 1376*7754SJeff.Bonwick@Sun.COM /* 1377*7754SJeff.Bonwick@Sun.COM * If you're a gang header, your data is in gn->gn_gbh. 1378*7754SJeff.Bonwick@Sun.COM * If you're a gang member, your data is in 'data' and gn == NULL. 
1379*7754SJeff.Bonwick@Sun.COM */ 1380*7754SJeff.Bonwick@Sun.COM zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); 1381789Sahrens 1382*7754SJeff.Bonwick@Sun.COM if (gn != NULL) { 1383*7754SJeff.Bonwick@Sun.COM ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 1384*7754SJeff.Bonwick@Sun.COM 1385*7754SJeff.Bonwick@Sun.COM for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1386*7754SJeff.Bonwick@Sun.COM blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1387*7754SJeff.Bonwick@Sun.COM if (BP_IS_HOLE(gbp)) 1388*7754SJeff.Bonwick@Sun.COM continue; 1389*7754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1390*7754SJeff.Bonwick@Sun.COM data = (char *)data + BP_GET_PSIZE(gbp); 1391*7754SJeff.Bonwick@Sun.COM } 1392*7754SJeff.Bonwick@Sun.COM } 1393*7754SJeff.Bonwick@Sun.COM 1394*7754SJeff.Bonwick@Sun.COM if (gn == lio->io_gang_tree) 1395*7754SJeff.Bonwick@Sun.COM ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); 1396*7754SJeff.Bonwick@Sun.COM 1397*7754SJeff.Bonwick@Sun.COM if (zio != pio) 1398*7754SJeff.Bonwick@Sun.COM zio_nowait(zio); 1399789Sahrens } 1400789Sahrens 14015530Sbonwick static int 1402*7754SJeff.Bonwick@Sun.COM zio_gang_assemble(zio_t *zio) 14035329Sgw25295 { 14045530Sbonwick blkptr_t *bp = zio->io_bp; 14055530Sbonwick 1406*7754SJeff.Bonwick@Sun.COM ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); 14075530Sbonwick 1408*7754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1409789Sahrens 14105530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1411789Sahrens } 1412789Sahrens 14135530Sbonwick static int 1414*7754SJeff.Bonwick@Sun.COM zio_gang_issue(zio_t *zio) 14156523Sek110237 { 1416*7754SJeff.Bonwick@Sun.COM zio_t *lio = zio->io_logical; 14176523Sek110237 blkptr_t *bp = zio->io_bp; 1418789Sahrens 1419*7754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1420*7754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 14215329Sgw25295 1422*7754SJeff.Bonwick@Sun.COM 
	ASSERT(BP_IS_GANG(bp) && zio == lio);

	/* Issue across the tree only if every assembly read succeeded. */
	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
	else
		zio_gang_tree_free(&lio->io_gang_tree);

	/* Gang children are now in flight; just interlock on their completion. */
	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Ready callback for each gang member write: fold the member's freshly
 * allocated DVA asizes into the corresponding DVAs of the parent gang
 * header's bp, so the header's asize accounts for the entire gang.
 * pio->io_lock serializes the updates since all members share pio's bp.
 * Members that didn't allocate (still holes) contribute nothing.
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	zio_t *lio = zio->io_logical;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
	ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * Turn pio's write into a gang block: allocate a gang header, split
 * pio's data into up to SPA_GBH_NBLKPTRS child writes, and rewrite the
 * header once the children are ready.  Called from zio_dva_allocate()
 * when a full-size allocation fails with ENOSPC, and recursively (via
 * the child writes' own allocation failures) to build deeper trees.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *lio = pio->io_logical;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int ndvas = lio->io_prop.zp_ndvas;
	/* Gang headers get one extra copy, capped at the pool's max. */
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
	    bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Find where this node lives in the gang tree: the root for the
	 * logical I/O, otherwise the slot our parent member reserved for
	 * us (passed through io_private by the zio_write() below).
	 */
	if (pio == lio) {
		gnpp = &lio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.  Each child gets an even
	 * share of the remaining data, rounded up to SPA_MINBLOCKSIZE,
	 * so the final children absorb any remainder.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		/* Members inherit checksum/ndvas; no compression, raw data. */
		zp.zp_checksum = lio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_ndvas = lio->io_prop.zp_ndvas;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

/*
 * Pipeline stage: allocate DVAs for a write.  On ENOSPC, fall back to
 * gang blocks (if the I/O is still bigger than the minimum block size).
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa->spa_normal_class;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
	ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);

	if (error) {
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: free the block's DVAs (deferred, not immediate).
 */
static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: claim the block's DVAs (e.g. during log replay);
 * any failure is reflected in io_error.
 */
static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	spa_t *spa = zio->io_spa;
	/* 'now' means this bp was allocated in this sync pass, not a rewrite. */
	boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);

	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));

	if (zio->io_bp == bp && !now) {
		/*
		 * This is a rewrite for sync-to-convergence.
		 * We can't do a metaslab_free(NOW) because bp wasn't allocated
		 * during this sync pass, which means that metaslab_sync()
		 * already committed the allocation.
		 */
		ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
		    BP_IDENTITY(&zio->io_bp_orig)));
		ASSERT(spa_sync_pass(spa) > 1);

		if (BP_IS_GANG(bp) && gn == NULL) {
			/*
			 * This is a gang leader whose gang header(s) we
			 * couldn't read now, so defer the free until later.
			 * The block should still be intact because without
			 * the headers, we'd never even start the rewrite.
			 */
			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
			return;
		}
	}

	if (!BP_IS_HOLE(bp))
		metaslab_free(spa, bp, bp->blk_birth, now);

	/* Recursively give back every block in the gang subtree. */
	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 * Tries the dedicated log class first, falling back to the normal class.
 */
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)
{
	int error;

	error = metaslab_alloc(spa, spa->spa_log_class, size,
	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

	if (error)
		error = metaslab_alloc(spa, spa->spa_normal_class, size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

	if (error == 0) {
		/* Fill in the bp fields metaslab_alloc() doesn't set. */
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(!BP_IS_GANG(bp));

	metaslab_free(spa, bp, txg, B_FALSE);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */

/*
 * Completion callback for the shared probe zio created in
 * zio_vdev_io_probe(): detach it from the vdev under vdev_probe_lock,
 * then re-execute every delegated zio, failing each with ENXIO first
 * if the vdev turned out to be inaccessible.
 */
static void
zio_vdev_io_probe_done(zio_t *zio)
{
	zio_t *dio;
	vdev_t *vd = zio->io_private;

	mutex_enter(&vd->vdev_probe_lock);
	ASSERT(vd->vdev_probe_zio == zio);
	vd->vdev_probe_zio = NULL;
	mutex_exit(&vd->vdev_probe_lock);

	while ((dio = zio->io_delegate_list) != NULL) {
		zio->io_delegate_list = dio->io_delegate_next;
		dio->io_delegate_next = NULL;
		if (!vdev_accessible(vd, dio))
			dio->io_error = ENXIO;
		zio_execute(dio);
	}
}

/*
 * Probe the device to determine whether I/O failure is specific to this
 * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
 */
static int
zio_vdev_io_probe(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	zio_t *pio = NULL;
	boolean_t created_pio = B_FALSE;

	/*
	 * Don't probe the probe.
	 */
	if (zio->io_flags & ZIO_FLAG_PROBE)
		return (ZIO_PIPELINE_CONTINUE);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will join the probe zio's io_delegate_list.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
		    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
		created_pio = B_TRUE;
		vd->vdev_probe_wanted = B_TRUE;
		spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
	}

	zio->io_delegate_next = pio->io_delegate_list;
	pio->io_delegate_list = zio;

	mutex_exit(&vd->vdev_probe_lock);

	/*
	 * Only the creator actually issues the probe; everyone else just
	 * rides along on io_delegate_list and is re-executed by
	 * zio_vdev_io_probe_done().
	 */
	if (created_pio) {
		zio_nowait(vdev_probe(vd, pio));
		zio_nowait(pio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * Pipeline stage: hand the I/O to the vdev layer.  For BP-level I/Os
 * (vd == NULL) this means the mirror ops, which fan out across DVAs;
 * for leaf reads/writes we try the vdev cache and the vdev queue first.
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		/* Config writers already hold SCL_ZIO equivalents. */
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	/*
	 * Odd-size I/O is rounded up to the device's ashift alignment;
	 * writes go through a zero-padded copy of the data.
	 */
	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	if (vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		/* Reads satisfied from the vdev cache are done immediately. */
		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
			return (ZIO_PIPELINE_STOP);

		/* The queue may aggregate or defer us (returns NULL). */
		if ((zio = vdev_queue_io(zio)) == NULL)
			return (ZIO_PIPELINE_STOP);

		if (!vdev_accessible(vd, zio)) {
			zio->io_error = ENXIO;
			zio_interrupt(zio);
			return (ZIO_PIPELINE_STOP);
		}

	}

	return (vd->vdev_ops->vdev_op_io_start(zio));
}

/*
 * Pipeline stage: complete the vdev-level I/O.  For leaves, drain the
 * queue, update the vdev cache for writes, apply any fault injection,
 * and classify errors -- an inaccessible vdev becomes ENXIO, anything
 * else on an accessible device triggers a device probe.
 */
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = ENXIO;
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		return (zio_vdev_io_probe(zio));

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: assess the outcome of the vdev I/O -- release the
 * config lock, free vdev-specific data, and decide whether to retry.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

1852*7754SJeff.Bonwick@Sun.COM if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 1853*7754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 1854789Sahrens 1855*7754SJeff.Bonwick@Sun.COM if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 1856*7754SJeff.Bonwick@Sun.COM spa_config_exit(zio->io_spa, SCL_ZIO, zio); 1857*7754SJeff.Bonwick@Sun.COM 1858*7754SJeff.Bonwick@Sun.COM if (zio->io_vsd != NULL) { 1859*7754SJeff.Bonwick@Sun.COM zio->io_vsd_free(zio); 1860*7754SJeff.Bonwick@Sun.COM zio->io_vsd = NULL; 18611732Sbonwick } 18621732Sbonwick 1863*7754SJeff.Bonwick@Sun.COM if (zio_injection_enabled && zio->io_error == 0) 18641544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1865789Sahrens 1866789Sahrens /* 1867789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1868789Sahrens */ 1869*7754SJeff.Bonwick@Sun.COM if (zio->io_error && vd == NULL && 1870*7754SJeff.Bonwick@Sun.COM !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 1871*7754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 1872*7754SJeff.Bonwick@Sun.COM ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 1873789Sahrens zio->io_error = 0; 1874*7754SJeff.Bonwick@Sun.COM zio->io_flags |= ZIO_FLAG_IO_RETRY | 1875*7754SJeff.Bonwick@Sun.COM ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 18761775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1877*7754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 1878*7754SJeff.Bonwick@Sun.COM return (ZIO_PIPELINE_STOP); 1879*7754SJeff.Bonwick@Sun.COM } 1880789Sahrens 1881*7754SJeff.Bonwick@Sun.COM /* 1882*7754SJeff.Bonwick@Sun.COM * If we got an error on a leaf device, convert it to ENXIO 1883*7754SJeff.Bonwick@Sun.COM * if the device is not accessible at all. 
1884*7754SJeff.Bonwick@Sun.COM */ 1885*7754SJeff.Bonwick@Sun.COM if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 1886*7754SJeff.Bonwick@Sun.COM !vdev_accessible(vd, zio)) 1887*7754SJeff.Bonwick@Sun.COM zio->io_error = ENXIO; 1888*7754SJeff.Bonwick@Sun.COM 1889*7754SJeff.Bonwick@Sun.COM /* 1890*7754SJeff.Bonwick@Sun.COM * If we can't write to an interior vdev (mirror or RAID-Z), 1891*7754SJeff.Bonwick@Sun.COM * set vdev_cant_write so that we stop trying to allocate from it. 1892*7754SJeff.Bonwick@Sun.COM */ 1893*7754SJeff.Bonwick@Sun.COM if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 1894*7754SJeff.Bonwick@Sun.COM vd != NULL && !vd->vdev_ops->vdev_op_leaf) 1895*7754SJeff.Bonwick@Sun.COM vd->vdev_cant_write = B_TRUE; 1896*7754SJeff.Bonwick@Sun.COM 1897*7754SJeff.Bonwick@Sun.COM if (zio->io_error) 1898*7754SJeff.Bonwick@Sun.COM zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1899789Sahrens 19005530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1901789Sahrens } 1902789Sahrens 1903789Sahrens void 1904789Sahrens zio_vdev_io_reissue(zio_t *zio) 1905789Sahrens { 1906789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1907789Sahrens ASSERT(zio->io_error == 0); 1908789Sahrens 1909789Sahrens zio->io_stage--; 1910789Sahrens } 1911789Sahrens 1912789Sahrens void 1913789Sahrens zio_vdev_io_redone(zio_t *zio) 1914789Sahrens { 1915789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1916789Sahrens 1917789Sahrens zio->io_stage--; 1918789Sahrens } 1919789Sahrens 1920789Sahrens void 1921789Sahrens zio_vdev_io_bypass(zio_t *zio) 1922789Sahrens { 1923789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1924789Sahrens ASSERT(zio->io_error == 0); 1925789Sahrens 1926789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1927789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1928789Sahrens } 1929789Sahrens 1930789Sahrens /* 1931789Sahrens * ========================================================================== 1932789Sahrens * Generate and 
verify checksums
 * ==========================================================================
 */
/*
 * Pipeline stage: compute the appropriate checksum over the zio's data
 * via zio_checksum_compute().
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			/*
			 * Gang headers get their own checksum type; per the
			 * assertion, the gang child writing the header is
			 * never the allocating zio.
			 */
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			/* Ordinary block: use the checksum recorded in bp. */
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: verify the checksum of data just read.  On mismatch the
 * error is recorded on the zio and, unless the read was speculative, an
 * FMA checksum ereport is posted.
 */
static int
zio_checksum_verify(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	int error;

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio)) != 0) {
		zio->io_error = error;
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			/* Report the checksum failure to FMA. */
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 * Clears the CHECKSUM_VERIFY stage from this zio's pipeline.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
/*
 * Return the "worse" of e1 and e2 per the ranking described above.
 * Errors not present in the rank table fall off the end of the loop and
 * therefore rank above everything listed.
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}

/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
/*
 * Ready-stage handler: once any gang children are ready, run the zio's
 * io_ready callback, snapshot the block pointer into io_bp_copy, and
 * notify the parent that this child is ready.
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio = zio->io_parent;

	if (zio->io_ready) {
		/*
		 * For gang blocks, all gang children must be ready before
		 * this zio can be declared ready.
		 */
		if (BP_IS_GANG(bp) &&
		    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
			return (ZIO_PIPELINE_STOP);

		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] ==
0);

		zio->io_ready(zio);
	}

	/* Keep io_bp_copy in sync with the (possibly updated) bp. */
	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	/* An error at ready time short-circuits the pipeline to interlock. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	if (pio != NULL)
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Final pipeline stage: assess the outcome of the whole zio, report
 * errors to FMA, decide whether the zio must be reexecuted or the pool
 * suspended, run the io_done callback, and either wake a waiter in
 * zio_wait() or destroy the zio.
 */
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *pio = zio->io_parent;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/* At this point no child of any type may still be outstanding. */
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	/* Sanity-check the block pointer before declaring the I/O done. */
	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (pio != NULL && bp == pio->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	/*
	 * If there were child vdev or gang errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);

		/*
		 * Allocating writes can be retried immediately unless the
		 * failure was ENOSPC, in which case retrying now would
		 * fail again; suspend instead.  (The else binds to the
		 * inner if here.)
		 */
		if (IO_IS_ALLOCATING(zio))
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * ENXIO on a read or free suspends the pool unless the
		 * failmode property says to continue.
		 */
		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    zio->io_error == ENXIO &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/* An I/O that may not fail must suspend rather than fail. */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		/* Undo any allocation so reexecution starts from scratch. */
		if (IO_IS_ALLOCATING(zio))
			zio_dva_unallocate(zio, zio->io_gang_tree, bp);

		zio_gang_tree_free(&zio->io_gang_tree);

		if (pio != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
			(void) taskq_dispatch(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child == NULL);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	if (zio->io_done)
		zio->io_done(zio);

	zio_gang_tree_free(&zio->io_gang_tree);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_remove_child(pio, zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		/*
		 * Someone is blocked on this zio; wake them instead of
		 * destroying it.  NOTE(review): presumably the waiter
		 * (zio_wait()) is responsible for freeing the zio — confirm.
		 */
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		/* No waiter: this zio's life ends here. */
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
/*
 * One handler per pipeline stage, in execution order; the first stage
 * has no handler.  Stage functions return ZIO_PIPELINE_CONTINUE to
 * advance or ZIO_PIPELINE_STOP to yield.
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
	NULL,
	zio_issue_async,
	zio_read_bp_init,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};