/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 67789Sahrens typedef struct zio_sync_pass { 68789Sahrens int zp_defer_free; /* defer frees after this pass */ 69789Sahrens int zp_dontcompress; /* don't compress after this pass */ 70789Sahrens int zp_rewrite; /* rewrite new bps 
after this pass */ 71789Sahrens } zio_sync_pass_t; 72789Sahrens 73789Sahrens zio_sync_pass_t zio_sync_pass = { 74789Sahrens 1, /* zp_defer_free */ 75789Sahrens 4, /* zp_dontcompress */ 76789Sahrens 1, /* zp_rewrite */ 77789Sahrens }; 78789Sahrens 79789Sahrens /* 80789Sahrens * ========================================================================== 81789Sahrens * I/O kmem caches 82789Sahrens * ========================================================================== 83789Sahrens */ 84789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 85789Sahrens 86789Sahrens void 87789Sahrens zio_init(void) 88789Sahrens { 89789Sahrens size_t c; 90789Sahrens 91789Sahrens /* 92789Sahrens * For small buffers, we want a cache for each multiple of 93789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 94789Sahrens * for each quarter-power of 2. For large buffers, we want 95789Sahrens * a cache for each multiple of PAGESIZE. 96789Sahrens */ 97789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 98789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 99789Sahrens size_t p2 = size; 100789Sahrens size_t align = 0; 101789Sahrens 102789Sahrens while (p2 & (p2 - 1)) 103789Sahrens p2 &= p2 - 1; 104789Sahrens 105789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 106789Sahrens align = SPA_MINBLOCKSIZE; 107789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 108789Sahrens align = PAGESIZE; 109789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 110789Sahrens align = p2 >> 2; 111789Sahrens } 112789Sahrens 113789Sahrens if (align != 0) { 114789Sahrens char name[30]; 115789Sahrens (void) sprintf(name, "zio_buf_%lu", size); 116789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 117849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 118789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 119789Sahrens size, align); 120789Sahrens } 121789Sahrens } 122789Sahrens 123789Sahrens while (--c != 0) { 
124789Sahrens ASSERT(zio_buf_cache[c] != NULL); 125789Sahrens if (zio_buf_cache[c - 1] == NULL) 126789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 127789Sahrens } 1281544Seschrock 1291544Seschrock zio_inject_init(); 130789Sahrens } 131789Sahrens 132789Sahrens void 133789Sahrens zio_fini(void) 134789Sahrens { 135789Sahrens size_t c; 136789Sahrens kmem_cache_t *last_cache = NULL; 137789Sahrens 138789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 139789Sahrens if (zio_buf_cache[c] != last_cache) { 140789Sahrens last_cache = zio_buf_cache[c]; 141789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 142789Sahrens } 143789Sahrens zio_buf_cache[c] = NULL; 144789Sahrens } 1451544Seschrock 1461544Seschrock zio_inject_fini(); 147789Sahrens } 148789Sahrens 149789Sahrens /* 150789Sahrens * ========================================================================== 151789Sahrens * Allocate and free I/O buffers 152789Sahrens * ========================================================================== 153789Sahrens */ 154789Sahrens void * 155789Sahrens zio_buf_alloc(size_t size) 156789Sahrens { 157789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 158789Sahrens 159789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 160789Sahrens 161789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 162789Sahrens } 163789Sahrens 164789Sahrens void 165789Sahrens zio_buf_free(void *buf, size_t size) 166789Sahrens { 167789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 168789Sahrens 169789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 170789Sahrens 171789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 172789Sahrens } 173789Sahrens 174789Sahrens /* 175789Sahrens * ========================================================================== 176789Sahrens * Push and pop I/O transform buffers 177789Sahrens * ========================================================================== 178789Sahrens */ 179789Sahrens static void 180789Sahrens 
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 181789Sahrens { 182789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 183789Sahrens 184789Sahrens zt->zt_data = data; 185789Sahrens zt->zt_size = size; 186789Sahrens zt->zt_bufsize = bufsize; 187789Sahrens 188789Sahrens zt->zt_next = zio->io_transform_stack; 189789Sahrens zio->io_transform_stack = zt; 190789Sahrens 191789Sahrens zio->io_data = data; 192789Sahrens zio->io_size = size; 193789Sahrens } 194789Sahrens 195789Sahrens static void 196789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 197789Sahrens { 198789Sahrens zio_transform_t *zt = zio->io_transform_stack; 199789Sahrens 200789Sahrens *data = zt->zt_data; 201789Sahrens *size = zt->zt_size; 202789Sahrens *bufsize = zt->zt_bufsize; 203789Sahrens 204789Sahrens zio->io_transform_stack = zt->zt_next; 205789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 206789Sahrens 207789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 208789Sahrens zio->io_data = zt->zt_data; 209789Sahrens zio->io_size = zt->zt_size; 210789Sahrens } 211789Sahrens } 212789Sahrens 213789Sahrens static void 214789Sahrens zio_clear_transform_stack(zio_t *zio) 215789Sahrens { 216789Sahrens void *data; 217789Sahrens uint64_t size, bufsize; 218789Sahrens 219789Sahrens ASSERT(zio->io_transform_stack != NULL); 220789Sahrens 221789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 222789Sahrens while (zio->io_transform_stack != NULL) { 223789Sahrens zio_buf_free(data, bufsize); 224789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 225789Sahrens } 226789Sahrens } 227789Sahrens 228789Sahrens /* 229789Sahrens * ========================================================================== 230789Sahrens * Create the various types of I/O (read, write, free) 231789Sahrens * ========================================================================== 232789Sahrens */ 233789Sahrens static zio_t * 
234789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 235789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 236789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 237789Sahrens { 238789Sahrens zio_t *zio; 239789Sahrens 240789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 241789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 242789Sahrens 243789Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 244789Sahrens zio->io_parent = pio; 245789Sahrens zio->io_spa = spa; 246789Sahrens zio->io_txg = txg; 247789Sahrens if (bp != NULL) { 248789Sahrens zio->io_bp = bp; 249789Sahrens zio->io_bp_copy = *bp; 250789Sahrens zio->io_bp_orig = *bp; 251789Sahrens } 252789Sahrens zio->io_done = done; 253789Sahrens zio->io_private = private; 254789Sahrens zio->io_type = type; 255789Sahrens zio->io_priority = priority; 256789Sahrens zio->io_stage = stage; 257789Sahrens zio->io_pipeline = pipeline; 258789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 259789Sahrens zio->io_timestamp = lbolt64; 260789Sahrens zio->io_flags = flags; 261789Sahrens zio_push_transform(zio, data, size, size); 262789Sahrens 263789Sahrens if (pio == NULL) { 264789Sahrens if (!(flags & ZIO_FLAG_CONFIG_HELD)) 2651544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 266789Sahrens zio->io_root = zio; 267789Sahrens } else { 268789Sahrens zio->io_root = pio->io_root; 2691544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 2701544Seschrock zio->io_logical = pio->io_logical; 271789Sahrens mutex_enter(&pio->io_lock); 272789Sahrens if (stage < ZIO_STAGE_READY) 273789Sahrens pio->io_children_notready++; 274789Sahrens pio->io_children_notdone++; 275789Sahrens zio->io_sibling_next = pio->io_child; 276789Sahrens zio->io_sibling_prev = NULL; 277789Sahrens if (pio->io_child != NULL) 278789Sahrens pio->io_child->io_sibling_prev = zio; 279789Sahrens pio->io_child = zio; 280*1775Sbillm zio->io_ndvas = pio->io_ndvas; 281789Sahrens 
mutex_exit(&pio->io_lock); 282789Sahrens } 283789Sahrens 284789Sahrens return (zio); 285789Sahrens } 286789Sahrens 287789Sahrens zio_t * 288789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 289789Sahrens int flags) 290789Sahrens { 291789Sahrens zio_t *zio; 292789Sahrens 293789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 294789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 295789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 296789Sahrens 297789Sahrens return (zio); 298789Sahrens } 299789Sahrens 300789Sahrens zio_t * 301789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 302789Sahrens { 303789Sahrens return (zio_null(NULL, spa, done, private, flags)); 304789Sahrens } 305789Sahrens 306789Sahrens zio_t * 307789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 308789Sahrens uint64_t size, zio_done_func_t *done, void *private, 3091544Seschrock int priority, int flags, zbookmark_t *zb) 310789Sahrens { 311789Sahrens zio_t *zio; 312789Sahrens 313789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 314789Sahrens 315789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 316789Sahrens ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 3171544Seschrock zio->io_bookmark = *zb; 3181544Seschrock 3191544Seschrock zio->io_logical = zio; 320789Sahrens 321789Sahrens /* 322789Sahrens * Work off our copy of the bp so the caller can free it. 
323789Sahrens */ 324789Sahrens zio->io_bp = &zio->io_bp_copy; 325789Sahrens 326789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 327789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 328789Sahrens void *cbuf = zio_buf_alloc(csize); 329789Sahrens 330789Sahrens zio_push_transform(zio, cbuf, csize, csize); 331789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 332789Sahrens } 333789Sahrens 334*1775Sbillm if (BP_IS_GANG(bp)) { 335789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 336789Sahrens void *gbuf = zio_buf_alloc(gsize); 337789Sahrens 338789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 339789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 340789Sahrens } 341789Sahrens 342789Sahrens return (zio); 343789Sahrens } 344789Sahrens 345789Sahrens zio_t * 346*1775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 347789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 3481544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 3491544Seschrock zbookmark_t *zb) 350789Sahrens { 351789Sahrens zio_t *zio; 352789Sahrens 353789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 354789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 355789Sahrens 356789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 357789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 358789Sahrens 359789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 360789Sahrens ZIO_TYPE_WRITE, priority, flags, 361789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 362789Sahrens 3631544Seschrock zio->io_bookmark = *zb; 3641544Seschrock 3651544Seschrock zio->io_logical = zio; 3661544Seschrock 367789Sahrens zio->io_checksum = checksum; 368789Sahrens zio->io_compress = compress; 369*1775Sbillm zio->io_ndvas = ncopies; 370789Sahrens 371789Sahrens if (compress != ZIO_COMPRESS_OFF) 372789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 373789Sahrens 374789Sahrens if (bp->blk_birth != txg) { 375789Sahrens /* XXX the 
bp usually (always?) gets re-zeroed later */ 376789Sahrens BP_ZERO(bp); 377789Sahrens BP_SET_LSIZE(bp, size); 378789Sahrens BP_SET_PSIZE(bp, size); 379*1775Sbillm } else { 380*1775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 381*1775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 382*1775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 383789Sahrens } 384789Sahrens 385789Sahrens return (zio); 386789Sahrens } 387789Sahrens 388789Sahrens zio_t * 389789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 390789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 3911544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 3921544Seschrock zbookmark_t *zb) 393789Sahrens { 394789Sahrens zio_t *zio; 395789Sahrens 396789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 397789Sahrens ZIO_TYPE_WRITE, priority, flags, 398789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 399789Sahrens 4001544Seschrock zio->io_bookmark = *zb; 401789Sahrens zio->io_checksum = checksum; 402789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 403789Sahrens 404*1775Sbillm if (pio != NULL) 405*1775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 406*1775Sbillm 407789Sahrens return (zio); 408789Sahrens } 409789Sahrens 410789Sahrens static zio_t * 411789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 412789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 413789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 414789Sahrens { 415789Sahrens zio_t *zio; 416789Sahrens 417789Sahrens BP_ZERO(bp); 418789Sahrens BP_SET_LSIZE(bp, size); 419789Sahrens BP_SET_PSIZE(bp, size); 420789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 421789Sahrens 422789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 423789Sahrens ZIO_TYPE_WRITE, priority, flags, 424789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 425789Sahrens 426789Sahrens zio->io_checksum = 
checksum; 427789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 428789Sahrens 429789Sahrens return (zio); 430789Sahrens } 431789Sahrens 432789Sahrens zio_t * 433789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 434789Sahrens zio_done_func_t *done, void *private) 435789Sahrens { 436789Sahrens zio_t *zio; 437789Sahrens 438789Sahrens ASSERT(!BP_IS_HOLE(bp)); 439789Sahrens 440789Sahrens if (txg == spa->spa_syncing_txg && 441789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 442789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 443789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 444789Sahrens } 445789Sahrens 446789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 447789Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0, 448789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 449789Sahrens 450789Sahrens zio->io_bp = &zio->io_bp_copy; 451789Sahrens 452789Sahrens return (zio); 453789Sahrens } 454789Sahrens 455789Sahrens zio_t * 456789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 457789Sahrens zio_done_func_t *done, void *private) 458789Sahrens { 459789Sahrens zio_t *zio; 460789Sahrens 461789Sahrens /* 462789Sahrens * A claim is an allocation of a specific block. Claims are needed 463789Sahrens * to support immediate writes in the intent log. The issue is that 464789Sahrens * immediate writes contain committed data, but in a txg that was 465789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 466789Sahrens * the intent log claims all blocks that contain immediate write data 467789Sahrens * so that the SPA knows they're in use. 468789Sahrens * 469789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 470789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 
471789Sahrens */ 472789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 473789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 474789Sahrens 475789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 476789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 477789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 478789Sahrens 479789Sahrens zio->io_bp = &zio->io_bp_copy; 480789Sahrens 481789Sahrens return (zio); 482789Sahrens } 483789Sahrens 484789Sahrens zio_t * 485789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 486789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 487789Sahrens { 488789Sahrens zio_t *zio; 489789Sahrens int c; 490789Sahrens 491789Sahrens if (vd->vdev_children == 0) { 492789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 493789Sahrens ZIO_TYPE_IOCTL, priority, flags, 494789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 495789Sahrens 496789Sahrens zio->io_vd = vd; 497789Sahrens zio->io_cmd = cmd; 498789Sahrens } else { 499789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 500789Sahrens 501789Sahrens for (c = 0; c < vd->vdev_children; c++) 502789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 503789Sahrens done, private, priority, flags)); 504789Sahrens } 505789Sahrens 506789Sahrens return (zio); 507789Sahrens } 508789Sahrens 509789Sahrens static void 510789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 511789Sahrens int checksum) 512789Sahrens { 513789Sahrens ASSERT(vd->vdev_children == 0); 514789Sahrens 515789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 516789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 517789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 518789Sahrens 519789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 520789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 521789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 522789Sahrens 523789Sahrens 
BP_ZERO(bp); 524789Sahrens 525789Sahrens BP_SET_LSIZE(bp, size); 526789Sahrens BP_SET_PSIZE(bp, size); 527789Sahrens 528789Sahrens BP_SET_CHECKSUM(bp, checksum); 529789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 530789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 531789Sahrens 532789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 533789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 534789Sahrens } 535789Sahrens 536789Sahrens zio_t * 537789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 538789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 539789Sahrens int priority, int flags) 540789Sahrens { 541789Sahrens zio_t *zio; 542789Sahrens blkptr_t blk; 543789Sahrens 544789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 545789Sahrens 546789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 547789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 548789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 549789Sahrens 550789Sahrens zio->io_vd = vd; 551789Sahrens zio->io_offset = offset; 552789Sahrens 553789Sahrens /* 554789Sahrens * Work off our copy of the bp so the caller can free it. 
555789Sahrens */ 556789Sahrens zio->io_bp = &zio->io_bp_copy; 557789Sahrens 558789Sahrens return (zio); 559789Sahrens } 560789Sahrens 561789Sahrens zio_t * 562789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 563789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 564789Sahrens int priority, int flags) 565789Sahrens { 566789Sahrens zio_block_tail_t *zbt; 567789Sahrens void *wbuf; 568789Sahrens zio_t *zio; 569789Sahrens blkptr_t blk; 570789Sahrens 571789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 572789Sahrens 573789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 574789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 575789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 576789Sahrens 577789Sahrens zio->io_vd = vd; 578789Sahrens zio->io_offset = offset; 579789Sahrens 580789Sahrens zio->io_bp = &zio->io_bp_copy; 581789Sahrens zio->io_checksum = checksum; 582789Sahrens 583789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 584789Sahrens /* 585789Sahrens * zbt checksums are necessarily destructive -- they modify 586789Sahrens * one word of the write buffer to hold the verifier/checksum. 587789Sahrens * Therefore, we must make a local copy in case the data is 588789Sahrens * being written to multiple places. 589789Sahrens */ 590789Sahrens wbuf = zio_buf_alloc(size); 591789Sahrens bcopy(data, wbuf, size); 592789Sahrens zio_push_transform(zio, wbuf, size, size); 593789Sahrens 594789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 595789Sahrens zbt->zbt_cksum = blk.blk_cksum; 596789Sahrens } 597789Sahrens 598789Sahrens return (zio); 599789Sahrens } 600789Sahrens 601789Sahrens /* 602789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 
603789Sahrens */ 604789Sahrens zio_t * 605789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 606789Sahrens void *data, uint64_t size, int type, int priority, int flags, 607789Sahrens zio_done_func_t *done, void *private) 608789Sahrens { 609789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 610789Sahrens zio_t *cio; 611789Sahrens 612789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 613789Sahrens /* 614789Sahrens * If we have the bp, then the child should perform the 615789Sahrens * checksum and the parent need not. This pushes error 616789Sahrens * detection as close to the leaves as possible and 617789Sahrens * eliminates redundant checksums in the interior nodes. 618789Sahrens */ 619789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 620789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 621789Sahrens } 622789Sahrens 623789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 624789Sahrens done, private, type, priority, 625789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 626*1775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 627789Sahrens 628789Sahrens cio->io_vd = vd; 629789Sahrens cio->io_offset = offset; 630789Sahrens 631789Sahrens return (cio); 632789Sahrens } 633789Sahrens 634789Sahrens /* 635789Sahrens * ========================================================================== 636789Sahrens * Initiate I/O, either sync or async 637789Sahrens * ========================================================================== 638789Sahrens */ 639789Sahrens int 640789Sahrens zio_wait(zio_t *zio) 641789Sahrens { 642789Sahrens int error; 643789Sahrens 644789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 645789Sahrens 646789Sahrens zio->io_waiter = curthread; 647789Sahrens 648789Sahrens zio_next_stage_async(zio); 649789Sahrens 650789Sahrens mutex_enter(&zio->io_lock); 651789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 652789Sahrens cv_wait(&zio->io_cv, 
&zio->io_lock); 653789Sahrens mutex_exit(&zio->io_lock); 654789Sahrens 655789Sahrens error = zio->io_error; 656789Sahrens 657789Sahrens kmem_free(zio, sizeof (zio_t)); 658789Sahrens 659789Sahrens return (error); 660789Sahrens } 661789Sahrens 662789Sahrens void 663789Sahrens zio_nowait(zio_t *zio) 664789Sahrens { 665789Sahrens zio_next_stage_async(zio); 666789Sahrens } 667789Sahrens 668789Sahrens /* 669789Sahrens * ========================================================================== 670789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 671789Sahrens * ========================================================================== 672789Sahrens */ 673789Sahrens static void 674789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 675789Sahrens { 676789Sahrens mutex_enter(&zio->io_lock); 677789Sahrens if (*countp == 0) { 678789Sahrens ASSERT(zio->io_stalled == 0); 679789Sahrens mutex_exit(&zio->io_lock); 680789Sahrens zio_next_stage(zio); 681789Sahrens } else { 682789Sahrens zio->io_stalled = stage; 683789Sahrens mutex_exit(&zio->io_lock); 684789Sahrens } 685789Sahrens } 686789Sahrens 687789Sahrens static void 688789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 689789Sahrens { 690789Sahrens zio_t *pio = zio->io_parent; 691789Sahrens 692789Sahrens mutex_enter(&pio->io_lock); 693789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 694789Sahrens pio->io_error = zio->io_error; 695789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 696789Sahrens pio->io_stalled = 0; 697789Sahrens mutex_exit(&pio->io_lock); 698789Sahrens zio_next_stage_async(pio); 699789Sahrens } else { 700789Sahrens mutex_exit(&pio->io_lock); 701789Sahrens } 702789Sahrens } 703789Sahrens 704789Sahrens static void 705789Sahrens zio_wait_children_ready(zio_t *zio) 706789Sahrens { 707789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 708789Sahrens &zio->io_children_notready); 
709789Sahrens } 710789Sahrens 711789Sahrens void 712789Sahrens zio_wait_children_done(zio_t *zio) 713789Sahrens { 714789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 715789Sahrens &zio->io_children_notdone); 716789Sahrens } 717789Sahrens 718789Sahrens static void 719789Sahrens zio_ready(zio_t *zio) 720789Sahrens { 721789Sahrens zio_t *pio = zio->io_parent; 722789Sahrens 723789Sahrens if (pio != NULL) 724789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 725789Sahrens &pio->io_children_notready); 726789Sahrens 727789Sahrens if (zio->io_bp) 728789Sahrens zio->io_bp_copy = *zio->io_bp; 729789Sahrens 730789Sahrens zio_next_stage(zio); 731789Sahrens } 732789Sahrens 733789Sahrens static void 734789Sahrens zio_done(zio_t *zio) 735789Sahrens { 736789Sahrens zio_t *pio = zio->io_parent; 737789Sahrens spa_t *spa = zio->io_spa; 738789Sahrens blkptr_t *bp = zio->io_bp; 739789Sahrens vdev_t *vd = zio->io_vd; 740896Smaybee char blkbuf[BP_SPRINTF_LEN]; 741789Sahrens 742789Sahrens ASSERT(zio->io_children_notready == 0); 743789Sahrens ASSERT(zio->io_children_notdone == 0); 744789Sahrens 745789Sahrens if (bp != NULL) { 746789Sahrens ASSERT(bp->blk_pad[0] == 0); 747789Sahrens ASSERT(bp->blk_pad[1] == 0); 748789Sahrens ASSERT(bp->blk_pad[2] == 0); 749789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 750789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 751*1775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 752789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 753*1775Sbillm if (zio->io_ndvas != 0) 754*1775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 755*1775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 756*1775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 757*1775Sbillm } 758789Sahrens } 759789Sahrens 760789Sahrens if (vd != NULL) 761789Sahrens vdev_stat_update(zio); 762789Sahrens 763789Sahrens if (zio->io_error) { 7641544Seschrock /* 7651544Seschrock * If this I/O is attached to a particular vdev, 7661544Seschrock 
* generate an error message describing the I/O failure 7671544Seschrock * at the block level. We ignore these errors if the 7681544Seschrock * device is currently unavailable. 7691544Seschrock */ 7701732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 7711544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 7721732Sbonwick zio->io_spa, vd, zio, 0, 0); 773789Sahrens 7741544Seschrock if ((zio->io_error == EIO || 7751544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 7761544Seschrock zio->io_logical == zio) { 7771544Seschrock /* 7781544Seschrock * For root I/O requests, tell the SPA to log the error 7791544Seschrock * appropriately. Also, generate a logical data 7801544Seschrock * ereport. 7811544Seschrock */ 7821544Seschrock spa_log_error(zio->io_spa, zio); 7831544Seschrock 7841544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 7851544Seschrock zio->io_spa, NULL, zio, 0, 0); 7861544Seschrock } 787789Sahrens 7881544Seschrock /* 7891544Seschrock * For I/O requests that cannot fail, panic appropriately. 7901544Seschrock */ 7911544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 7921544Seschrock sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 7931544Seschrock bp ? bp : &zio->io_bp_copy); 7941544Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 7951544Seschrock "%d", zio->io_error == ECKSUM ? 
		    /* (continuation of zio_done(), which begins earlier in the file) */
		    "bad checksum" : "I/O failure",
		    zio_type_name[zio->io_type],
		    vdev_description(vd),
		    (u_longlong_t)zio->io_offset,
		    zio, blkbuf, zio->io_error);
		}
	}

	/* Discard any transform buffers still on the stack. */
	zio_clear_transform_stack(zio);

	/* Invoke the caller's completion callback, if any. */
	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		/* Unlink this zio from its parent's list of children. */
		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		/* Let the parent know one more child has completed. */
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/*
	 * No parent, and the config lock isn't externally held
	 * (ZIO_FLAG_CONFIG_HELD): drop the lock this zio acquired.
	 */
	if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
		spa_config_exit(spa, zio);

	/*
	 * If a thread is waiting on this zio, wake it — the waiter then
	 * owns (and presumably frees) the zio.  Otherwise free it here.
	 */
	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		kmem_free(zio, sizeof (zio_t));
	}
}

/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */

/*
 * Pipeline stage: compress the write payload (if requested) and set up the
 * bp and the remaining pipeline accordingly — rewrite the existing block in
 * place, skip allocation entirely when the data compresses away to nothing,
 * or fall through to a fresh allocation at the (possibly compressed) size.
 */
static void
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;	/* logical (uncompressed) size */
	uint64_t csize = lsize;		/* compressed size, if any */
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	/* If compression fails to shrink the data, fall back to raw. */
	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
		ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);

		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
	} else {
		if (bp->blk_birth == zio->io_txg) {
			ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
			bzero(bp, sizeof (blkptr_t));
		}
		if (csize == 0) {
			/* Data compressed away entirely: no block needed. */
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
		}
	}

	zio_next_stage(zio);
}

/*
 * Pipeline stage: decompress just-read data (popped from the transform
 * stack) into the caller's buffer; flag EIO on decompression failure.
 */
static void
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */

/*
 * Pipeline stage: the pipeline is built assuming a gang block; if this
 * bp is not actually a gang block, strip the gang-specific stages.
 */
static void
zio_gang_pipeline(zio_t *zio)
{
	/*
	 * By default, the pipeline assumes that we're dealing with a gang
	 * block.  If we're not, strip out any gang-specific stages.
	 */
	if (!BP_IS_GANG(zio->io_bp))
		zio->io_pipeline &= ~ZIO_GANG_STAGES;

	zio_next_stage(zio);
}

/*
 * Byteswap the gang header in place if the bp says it was written in
 * the other byte order.
 */
static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

/*
 * Pipeline stage: read the gang header for io_bp into a freshly allocated
 * buffer (pushed as a transform) via a child read zio.
 */
static void
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(BP_IS_GANG(bp));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));

	zio_wait_children_done(zio);
}

/*
 * Pipeline stage: issue a child read for each gang member, landing at its
 * logical offset within io_data, then wait for all of them to complete.
 */
static void
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
		    &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);
	zio_wait_children_done(zio);
}

/*
 * Pipeline stage: rewrite each gang member in place, push the header back
 * on the transform stack, and wait for the children to be ready.
 */
static void
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority, zio->io_flags,
		    &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);
	zio_wait_children_ready(zio);
}

/*
 * Pipeline stage: free every non-hole member of a gang block.
 */
static void
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

/*
 * Pipeline stage: claim every non-hole member of a gang block.
 */
static void
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

/*
 * Child-done callback: fold the child's allocated sizes into the
 * corresponding DVAs of the parent (gang header) bp, under the
 * parent's io_lock since siblings complete concurrently.
 */
static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;
	int d;

	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

/*
 * Allocate this write as a gang block: allocate the gang header, then
 * carve the payload into members, halving the per-member allocation
 * target on ENOSPC.  A member that still cannot be allocated whole
 * recurses via zio_write_allocate() (multi-level gang block).
 */
static void
zio_write_allocate_gang_members(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;	/* payload bytes still to place */
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	/* the header itself gets one extra DVA, capped at the spa maximum */
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL);
	if (error == ENOSPC)
		panic("can't allocate gang block header");
	ASSERT(error == 0);

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	/* We need to test multi-level gang blocks */
	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/* Shrink the target size until an allocation succeeds. */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
			    txg, bp);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			if (maxalloc == SPA_MINBLOCKSIZE)
				panic("really out of space");
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/* Allocation succeeded: rewrite this member. */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags,
			    &zio->io_bookmark));
		} else {
			/* Still too big: recurse into a nested gang block. */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);
	/*
	 * As much as we'd like this to be zio_wait_children_ready(),
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	zio_wait_children_done(zio);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

/*
 * Pipeline stage: allocate DVAs for this block; on ENOSPC fall back to
 * gang-block allocation.
 */
static void
zio_dva_allocate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_ndvas, >, 0);
	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));

	/* For testing, make some blocks above a certain size be gang blocks */
	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
		zio_write_allocate_gang_members(zio);
		return;
	}

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
	    zio->io_txg, NULL);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC) {
		if (zio->io_size == SPA_MINBLOCKSIZE)
			panic("really, truly out of space");
		zio_write_allocate_gang_members(zio);
		return;
	} else {
		zio->io_error = error;
	}
	zio_next_stage(zio);
}

/*
 * Pipeline stage: free every DVA of the block, then zero the bp.
 */
static void
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	int d;

	ASSERT(!BP_IS_HOLE(bp));

	for (d = 0; d < BP_GET_NDVAS(bp); d++)
		metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE);

	BP_ZERO(bp);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: claim every DVA of the block; any failure is recorded
 * in io_error (the last failing errno wins).
 */
static void
zio_dva_claim(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	int error = 0;
	int d;

	ASSERT(!BP_IS_HOLE(bp));

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg);
		if (error)
			zio->io_error = error;
	}

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */

/*
 * Pipeline stage: start device I/O.  BP-wide I/O (vd == NULL) is routed
 * through the mirror ops, which fan out over the bp's DVAs.  Leaf I/O is
 * offset past the vdev label and padded up to the top-level vdev's ashift
 * alignment when necessary (ZIO_FLAG_SUBBLOCK).
 */
static void
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;
	blkptr_t *bp = zio->io_bp;
	uint64_t align;

	if (vd == NULL) {
		/* The mirror_ops handle multiple DVAs in a single BP */
		vdev_mirror_ops.vdev_op_io_start(zio);
		return;
	}

	align = 1ULL << tvd->vdev_ashift;

	if (zio->io_retries == 0 && vd == tvd)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
	    vd->vdev_children == 0) {
		/* Leaf vdev: convert the logical offset to a physical one. */
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	if (P2PHASE(zio->io_size, align) != 0) {
		/* Pad the I/O out to the device's minimum block size. */
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == tvd);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize);
		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	ASSERT(bp == NULL ||
	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	vdev_io_start(zio);

	/* zio_next_stage_async() gets called from io completion interrupt */
}

/*
 * Pipeline stage: device I/O completion, routed the same way as the
 * start stage above.
 */
static void
zio_vdev_io_done(zio_t *zio)
{
	if (zio->io_vd == NULL)
		/* The mirror_ops handle multiple DVAs in a single BP */
		vdev_mirror_ops.vdev_op_io_done(zio);
	else
		vdev_io_done(zio);
}

/*
 * Decide whether a failed I/O is worth a single retry at the top-level
 * vdev (delegated, already-retried, and DONT_RETRY I/Os are not).
 */
/* XXPOLICY */
boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd && vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Pipeline stage: assess the completed device I/O — undo SUBBLOCK
 * padding, apply fault injection, retry once if policy allows, and
 * request an async vdev reopen on hard (non-checksum) errors.
 */
static void
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;

	ASSERT(zio->io_vsd == NULL);

	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
		/* Strip the alignment padding added in zio_vdev_io_start(). */
		void *abuf;
		uint64_t asize;
		ASSERT(vd == tvd);
		zio_pop_transform(zio, &abuf, &asize, &asize);
		if (zio->io_type == ZIO_TYPE_READ)
			bcopy(abuf, zio->io_data, zio->io_size);
		zio_buf_free(abuf, asize);
		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
	}

	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		ASSERT(tvd == vd);
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		/* Rewind the pipeline so the vdev I/O stages run again. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;

		dprintf("retry #%d for %s to %s offset %llx\n",
		    zio->io_retries, zio_type_name[zio->io_type],
		    vdev_description(vd), zio->io_offset);

		zio_next_stage_async(zio);
		return;
	}

	if (zio->io_error != 0 && zio->io_error != ECKSUM &&
	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
		/*
		 * Poor man's hotplug support.  Even if we're done retrying this
		 * I/O, try to reopen the vdev to see if it's still attached.
		 * To avoid excessive thrashing, we only try it once a minute.
		 * This also has the effect of detecting when missing devices
		 * have come back, by polling the device once a minute.
		 *
		 * We need to do this asynchronously because we can't grab
		 * all the necessary locks way down here.
		 */
		if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
			vd->vdev_last_try = gethrtime();
			tvd->vdev_reopen_wanted = 1;
			spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
		}
	}

	zio_next_stage(zio);
}

/*
 * Rewind the pipeline to reissue this I/O from the start stage.
 */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

/*
 * Rewind the pipeline to run the done stage again.
 */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

/*
 * Skip the device I/O entirely and jump straight to the assess stage.
 */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */

/*
 * Pipeline stage: record the checksum type and byte order in the bp and
 * compute the checksum of the write payload into blk_cksum.
 */
static void
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: checksum a gang header.  The external verifier is
 * stored in the header's tail before checksumming.
 */
static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	/*
	 * NOTE(review): zc receives the computed checksum but is not read
	 * here — presumably zio_checksum() embeds the result in the header
	 * itself for this checksum type; confirm against zio_checksum().
	 */
	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

/*
 * Pipeline stage: verify the checksum of just-read data and post an
 * FMA ereport on failure (unless the read was speculative).
 */
static void
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
	}

	zio_next_stage(zio);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on stuff in the bp
 * (the first DVA's vdev/offset plus the birth txg uniquely identify it).
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	blkptr_t *bp = zio->io_bp;

	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
	zcp->zc_word[2] = bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef void zio_pipe_stage_t(zio_t *zio);

/* Sentinel stage: reaching it means io_stage went out of bounds. */
static void
zio_badop(zio_t *zio)
{
	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
}

/*
 * Stage dispatch table, indexed by zio->io_stage (see zio_next_stage()).
 * NOTE(review): entry order must correspond to the ZIO_STAGE_* enum —
 * confirm against zio_impl.h when modifying.
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	zio_wait_children_ready,
	zio_write_compress,
	zio_checksum_generate,
	zio_gang_pipeline,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_done,
	zio_badop
};

/*
 * Move an I/O to the next stage of the pipeline and execute that stage.
 * There's no locking on io_stage because there's no legitimate way for
 * multiple threads to be attempting to process the same I/O.
 */
void
zio_next_stage(zio_t *zio)
{
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		/* On error, skip ahead to the error-handling stages. */
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	/* Advance io_stage to the next stage enabled in the pipeline mask. */
	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	zio_pipeline[zio->io_stage](zio);
}

/*
 * Like zio_next_stage(), but stages marked in io_async_stages are
 * dispatched to a per-I/O-type taskq instead of run inline.
 */
void
zio_next_stage_async(zio_t *zio)
{
	taskq_t *tq;
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		/* On error, skip ahead to the error-handling stages. */
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	/* Advance io_stage to the next stage enabled in the pipeline mask. */
	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	/*
	 * For performance, we'll probably want two sets of task queues:
	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
	 * part is for read performance: since we have to make a pass over
	 * the data to checksum it anyway, we want to do this on the same CPU
	 * that issued the read, because (assuming CPU scheduling affinity)
	 * that thread is probably still there.  Getting this optimization
	 * right avoids performance-hostile cache-to-cache transfers.
	 *
	 * Note that having two sets of task queues is also necessary for
	 * correctness: if all of the issue threads get bogged down waiting
	 * for dependent reads (e.g. metaslab freelist) to complete, then
	 * there won't be any threads available to service I/O completion
	 * interrupts.
	 */
	if ((1U << zio->io_stage) & zio->io_async_stages) {
		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
		else
			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
		(void) taskq_dispatch(tq,
		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
	} else {
		zio_pipeline[zio->io_stage](zio);
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
    uint64_t txg)
{
	int error;

	/* Hold the config lock as reader across the allocation. */
	spa_config_enter(spa, RW_READER, FTAG);

	BP_ZERO(bp);

	/* Intent log blocks are single-copy (1 DVA). */
	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);

	if (error == 0) {
		/* Fill in the bp: uncompressed, level-0 intent log block. */
		BP_SET_CHECKSUM(bp, checksum);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(bp, 0);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		bp->blk_birth = txg;
	}

	spa_config_exit(spa, FTAG);

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(!BP_IS_GANG(bp));

	dprintf_bp(bp, "txg %llu: ", txg);

	spa_config_enter(spa, RW_READER, FTAG);

	metaslab_free(spa, BP_IDENTITY(bp), txg, B_FALSE);

	spa_config_exit(spa, FTAG);
}