/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

static void zio_vdev_io_enter(zio_t *zio);
static void zio_vdev_io_exit(zio_t *zio);

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

/* At or above this size, force gang blocking - for testing */
uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;

typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};

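/*
 * How the sync-pass thresholds above are consulted (paraphrased from the
 * code later in this file): zio_free() defers a free to the next txg once
 * spa_sync_pass() exceeds zp_defer_free, zio_write_compress() stops
 * compressing once the pass exceeds zp_dontcompress, and it switches from
 * allocating new blocks to rewriting existing ones once the pass exceeds
 * zp_rewrite.  A sketch of the checks:
 *
 *	pass = spa_sync_pass(spa);
 *	if (pass > zio_sync_pass.zp_defer_free)	  ... enqueue a deferred free
 *	if (pass > zio_sync_pass.zp_dontcompress) ... compress = OFF
 *	if (pass > zio_sync_pass.zp_rewrite && ...) ... use REWRITE pipeline
 */
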
/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

void
zio_init(void)
{
	size_t c;

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[30];
			(void) sprintf(name, "zio_buf_%lu", size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
			dprintf("creating cache for size %5lx align %5lx\n",
			    size, align);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];
	}
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;
	}
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

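/*
 * Worked example of the cache-sizing policy in zio_init() (illustrative
 * only; the exact buckets depend on the platform's PAGESIZE, 8K assumed
 * here):
 *
 *	size  1K:	<= 4 * SPA_MINBLOCKSIZE, so align = 512
 *	size  8K:	a multiple of PAGESIZE, so align = PAGESIZE
 *	size 12K:	p2 = 8K and 12K is a multiple of p2 >> 2 = 2K,
 *			so align = 2K (a "quarter-power of 2" cache)
 *	size 13K:	no rule matches, so no cache is created; the fixup
 *			loop at the end of zio_init() makes that slot of
 *			zio_buf_cache[] point at the next larger cache, and
 *			zio_buf_alloc(13K) reaches it via the bucket index
 *			(size - 1) >> SPA_MINBLOCKSHIFT.
 */
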
/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_data = data;
	zt->zt_size = size;
	zt->zt_bufsize = bufsize;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	zio_transform_t *zt = zio->io_transform_stack;

	*data = zt->zt_data;
	*size = zt->zt_size;
	*bufsize = zt->zt_bufsize;

	zio->io_transform_stack = zt->zt_next;
	kmem_free(zt, sizeof (zio_transform_t));

	if ((zt = zio->io_transform_stack) != NULL) {
		zio->io_data = zt->zt_data;
		zio->io_size = zt->zt_size;
	}
}

static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	ASSERT(zio->io_transform_stack != NULL);

	zio_pop_transform(zio, &data, &size, &bufsize);
	while (zio->io_transform_stack != NULL) {
		zio_buf_free(data, bufsize);
		zio_pop_transform(zio, &data, &size, &bufsize);
	}
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		/* XXBP - Need to inherit this when it matters */
		zio->io_dva_index = 0;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
	zio->io_timestamp = lbolt64;
	zio->io_flags = flags;
	zio_push_transform(zio, data, size, size);

	if (pio == NULL) {
		if (!(flags & ZIO_FLAG_CONFIG_HELD))
			spa_config_enter(zio->io_spa, RW_READER);
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;

		mutex_enter(&pio->io_lock);
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		mutex_exit(&pio->io_lock);
	}

	return (zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	dva_t *dva;

	ASSERT3U(size, ==, BP_GET_LSIZE(bp));

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	bp = zio->io_bp;
	dva = ZIO_GET_DVA(zio);

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (DVA_GET_GANG(dva)) {
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
	    checksum < ZIO_CHECKSUM_FUNCTIONS);

	ASSERT(compress >= ZIO_COMPRESS_OFF &&
	    compress < ZIO_COMPRESS_FUNCTIONS);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = compress;

	if (compress != ZIO_COMPRESS_OFF)
		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	return (zio);
}

static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	return (zio);
}

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg == spa->spa_syncing_txg &&
	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, 0));
	}

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	/* XXBP - We need to re-evaluate when to insert pipeline stages */
	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum)
{
	ASSERT(vd->vdev_children == 0);

	ASSERT(size <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	BP_ZERO(bp);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_block_tail_t *zbt;
	void *wbuf;
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	zio->io_bp = &zio->io_bp_copy;
	zio->io_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * one word of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places.
		 */
		wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);

		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
		zbt->zbt_cksum = blk.blk_cksum;
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.  It has no associated bp.
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	zio->io_waiter = curthread;

	zio_next_stage_async(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;

	kmem_free(zio, sizeof (zio_t));

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	zio_next_stage_async(zio);
}

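/*
 * Typical usage of the interfaces above, sketched for illustration (the
 * real callers live elsewhere, e.g. in the SPA and DMU): build a tree of
 * I/Os under a root zio, fire the children asynchronously, then wait once
 * for the whole tree.  A parent is not done until all of its children are;
 * the scoreboarding functions in the next section enforce that dependency.
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	for (...)
 *		zio_nowait(zio_read(rio, spa, bp, buf, size, done, arg,
 *		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL));
 *	error = zio_wait(rio);	(frees rio and returns the first error)
 */
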
/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */
static void
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	mutex_enter(&zio->io_lock);
	if (*countp == 0) {
		ASSERT(zio->io_stalled == 0);
		mutex_exit(&zio->io_lock);
		zio_next_stage(zio);
	} else {
		if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
			zio_vdev_io_exit(zio);
		zio->io_stalled = stage;
		mutex_exit(&zio->io_lock);
	}
}

static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		pio->io_error = zio->io_error;
	if (--*countp == 0 && pio->io_stalled == stage) {
		if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
			zio_vdev_io_enter(pio);
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_next_stage_async(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_wait_children_ready(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
	    &zio->io_children_notready);
}

void
zio_wait_children_done(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
	    &zio->io_children_notdone);
}

static void
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	zio_next_stage(zio);
}

static void
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	char blkbuf[BP_SPRINTF_LEN];

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
	}

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
		    bp ? bp : &zio->io_bp_copy);
		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
		    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
		    zio_type_name[zio->io_type],
		    vdev_description(vd),
		    (u_longlong_t)zio->io_offset,
		    zio, blkbuf, zio->io_error);
	}

	if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
		    bp ? bp : &zio->io_bp_copy);
		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
		    "partial write",
		    zio_type_name[zio->io_type],
		    vdev_description(vd),
		    (u_longlong_t)zio->io_offset,
		    zio, blkbuf, zio->io_numerrors);
	}

	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
		    bp ? bp : &zio->io_bp_copy);
		panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
"bad checksum" : "I/O failure", 777789Sahrens zio_type_name[zio->io_type], 778789Sahrens vdev_description(vd), 779789Sahrens (u_longlong_t)zio->io_offset, 780789Sahrens zio, blkbuf, zio->io_error); 781789Sahrens } 782789Sahrens 783789Sahrens zio_clear_transform_stack(zio); 784789Sahrens 785789Sahrens if (zio->io_done) 786789Sahrens zio->io_done(zio); 787789Sahrens 788789Sahrens ASSERT(zio->io_delegate_list == NULL); 789789Sahrens ASSERT(zio->io_delegate_next == NULL); 790789Sahrens 791789Sahrens if (pio != NULL) { 792789Sahrens zio_t *next, *prev; 793789Sahrens 794789Sahrens mutex_enter(&pio->io_lock); 795789Sahrens next = zio->io_sibling_next; 796789Sahrens prev = zio->io_sibling_prev; 797789Sahrens if (next != NULL) 798789Sahrens next->io_sibling_prev = prev; 799789Sahrens if (prev != NULL) 800789Sahrens prev->io_sibling_next = next; 801789Sahrens if (pio->io_child == zio) 802789Sahrens pio->io_child = next; 803789Sahrens mutex_exit(&pio->io_lock); 804789Sahrens 805789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 806789Sahrens &pio->io_children_notdone); 807789Sahrens } 808789Sahrens 809789Sahrens if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) 810789Sahrens spa_config_exit(spa); 811789Sahrens 812789Sahrens if (zio->io_waiter != NULL) { 813789Sahrens mutex_enter(&zio->io_lock); 814789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 815789Sahrens zio->io_stalled = zio->io_stage; 816789Sahrens cv_broadcast(&zio->io_cv); 817789Sahrens mutex_exit(&zio->io_lock); 818789Sahrens } else { 819789Sahrens kmem_free(zio, sizeof (zio_t)); 820789Sahrens } 821789Sahrens } 822789Sahrens 823789Sahrens /* 824789Sahrens * ========================================================================== 825789Sahrens * Compression support 826789Sahrens * ========================================================================== 827789Sahrens */ 828789Sahrens static void 829789Sahrens zio_write_compress(zio_t *zio) 830789Sahrens { 831789Sahrens int compress = zio->io_compress; 832789Sahrens blkptr_t *bp = zio->io_bp; 833789Sahrens void *cbuf; 834789Sahrens uint64_t lsize = zio->io_size; 835789Sahrens uint64_t csize = lsize; 836789Sahrens uint64_t cbufsize = 0; 837789Sahrens int pass; 838789Sahrens 839789Sahrens if (bp->blk_birth == zio->io_txg) { 840789Sahrens /* 841789Sahrens * We're rewriting an existing block, which means we're 842789Sahrens * working on behalf of spa_sync(). For spa_sync() to 843789Sahrens * converge, it must eventually be the case that we don't 844789Sahrens * have to allocate new blocks. But compression changes 845789Sahrens * the blocksize, which forces a reallocate, and makes 846789Sahrens * convergence take longer. Therefore, after the first 847789Sahrens * few passes, stop compressing to ensure convergence. 
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
		ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);

		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
	} else {
		if (bp->blk_birth == zio->io_txg) {
			ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
			bzero(bp, sizeof (blkptr_t));
		}
		if (csize == 0) {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
		}
	}

	zio_next_stage(zio);
}

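/*
 * Putting the two policy comments above together, the per-pass behavior of
 * zio_write_compress() for a block rewritten during spa_sync() works out
 * roughly as follows with the default zio_sync_pass settings
 * (zp_rewrite = 1, zp_dontcompress = 4); this is an illustration, not a
 * guarantee:
 *
 *	pass 1:		compress and always allocate a new block
 *			(pass is not yet > zp_rewrite).
 *	passes 2-4:	still compress; if the compressed psize matches the
 *			existing bp, rewrite in place (ZIO_REWRITE_PIPELINE),
 *			otherwise reallocate.
 *	passes 5+:	compression is disabled, so once a block has been
 *			written uncompressed its psize stops changing and
 *			later passes rewrite it in place.
 *
 *	Blocks whose data compresses away entirely (csize == 0) skip
 *	allocation and take ZIO_WAIT_FOR_CHILDREN_PIPELINE.
 */
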
static void
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */
static void
zio_gang_pipeline(zio_t *zio)
{
	/*
	 * By default, the pipeline assumes that we're dealing with a gang
	 * block.  If we're not, strip out any gang-specific stages.
	 */
	if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
		zio->io_pipeline &= ~ZIO_GANG_STAGES;

	zio_next_stage(zio);
}

static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

static void
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));

	zio_wait_children_done(zio);
}

static void
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
	}

	zio_buf_free(gbh, gbufsize);
	zio_wait_children_done(zio);
}

static void
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority, zio->io_flags));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);
	zio_wait_children_ready(zio);
}

static void
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

static void
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = ZIO_GET_DVA(zio);
	dva_t *pdva = ZIO_GET_DVA(pio);
	uint64_t asize;

	ASSERT(DVA_GET_GANG(pdva));

	/* XXBP - Need to be careful here with multiple DVAs */
	mutex_enter(&pio->io_lock);
	asize = DVA_GET_ASIZE(pdva);
	asize += DVA_GET_ASIZE(cdva);
	DVA_SET_ASIZE(pdva, asize);
	mutex_exit(&pio->io_lock);
}

static void
zio_write_allocate_gang_members(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	zio_gbh_phys_t *gbh;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int error;
	int i;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
	if (error == ENOSPC)
		panic("can't allocate gang block header");
	ASSERT(error == 0);

	DVA_SET_GANG(dva, 1);

	bp->blk_birth = zio->io_txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = &gbp->blk_dva[0];

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(zio->io_spa, maxalloc, dva,
			    zio->io_txg);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			if (maxalloc == SPA_MINBLOCKSIZE)
				panic("really out of space");
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = zio->io_txg;
			zio_nowait(zio_rewrite(zio, zio->io_spa,
			    zio->io_checksum, zio->io_txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		} else {
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, zio->io_spa,
			    zio->io_checksum, zio->io_txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);
	zio_wait_children_done(zio);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static void
zio_dva_allocate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);
	int error;

	ASSERT(BP_IS_HOLE(bp));

	/* For testing, make some blocks above a certain size be gang blocks */
	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
		zio_write_allocate_gang_members(zio);
		return;
	}

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC) {
		if (zio->io_size == SPA_MINBLOCKSIZE)
			panic("really, truly out of space");
		zio_write_allocate_gang_members(zio);
		return;
	} else {
		zio->io_error = error;
	}
	zio_next_stage(zio);
}

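/*
 * Illustration of the ENOSPC fallback above: if, say, a 128K allocation
 * fails, zio_write_allocate_gang_members() allocates a SPA_GANGBLOCKSIZE
 * header and then tries to place the data in gang members of at most 64K
 * each, halving maxalloc (64K, 32K, ...) whenever metaslab_alloc() keeps
 * returning ENOSPC.  Members that fit at the current maxalloc are issued
 * as zio_rewrite() children; once the remaining data can no longer be
 * covered by the remaining block pointers at that size, the member is
 * written through zio_write_allocate(), which may gang again.  The member
 * done callback accumulates each child's asize into the parent's gang DVA.
 */
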
static void
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);

	ASSERT(!BP_IS_HOLE(bp));

	metaslab_free(zio->io_spa, dva, zio->io_txg);

	BP_ZERO(bp);

	zio_next_stage(zio);
}

static void
zio_dva_claim(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = ZIO_GET_DVA(zio);

	ASSERT(!BP_IS_HOLE(bp));

	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);

	zio_next_stage(zio);
}

static void
zio_dva_translate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	dva_t *dva = ZIO_GET_DVA(zio);
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);

	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));

	zio->io_offset = offset;

	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
		zio->io_error = ENXIO;
	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
		zio->io_error = EOVERFLOW;

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */
static void
zio_vdev_io_enter(zio_t *zio)
{
	vdev_t *tvd = zio->io_vd->vdev_top;

	mutex_enter(&tvd->vdev_io_lock);
	ASSERT(zio->io_pending.list_next == NULL);
	list_insert_tail(&tvd->vdev_io_pending, zio);
	mutex_exit(&tvd->vdev_io_lock);
}

static void
zio_vdev_io_exit(zio_t *zio)
{
	vdev_t *tvd = zio->io_vd->vdev_top;

	mutex_enter(&tvd->vdev_io_lock);
	ASSERT(zio->io_pending.list_next != NULL);
	list_remove(&tvd->vdev_io_pending, zio);
	if (list_head(&tvd->vdev_io_pending) == NULL)
		cv_broadcast(&tvd->vdev_io_cv);
	mutex_exit(&tvd->vdev_io_lock);
}

static void
zio_vdev_io_retry(void *vdarg)
{
	vdev_t *vd = vdarg;
	zio_t *zio, *zq;

	ASSERT(vd == vd->vdev_top);

	/* XXPOLICY */
	delay(hz);

	vdev_reopen(vd, &zq);

	while ((zio = zq) != NULL) {
		zq = zio->io_retry_next;
		zio->io_retry_next = NULL;
		dprintf("async retry #%d for I/O to %s offset %llx\n",
		    zio->io_retries, vdev_description(vd), zio->io_offset);
		zio_next_stage_async(zio);
	}
}

static void
zio_vdev_io_setup(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/* XXPOLICY */
	if (zio->io_retries == 0 && vd == vd->vdev_top)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	zio_vdev_io_enter(zio);

	zio_next_stage(zio);
}

static void
zio_vdev_io_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	vdev_io_start(zio);

	/* zio_next_stage_async() gets called from io completion interrupt */
}

static void
zio_vdev_io_done(zio_t *zio)
{
	vdev_io_done(zio);
}

/* XXPOLICY */
static boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 300 &&
	    (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
		return (B_FALSE);
	if (zio->io_retries > 1 &&
	    (zio->io_error == ECKSUM || zio->io_error == ENXIO))
		return (B_FALSE);

	return (B_TRUE);
}

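/*
 * Retry handling in zio_vdev_io_assess() below escalates in two steps:
 * every retry clears FAILFAST, sets DONT_CACHE, and rewinds the stage to
 * VDEV_IO_SETUP; the first retry of a failed I/O is then reissued
 * immediately, while subsequent retries are queued on the top-level
 * vdev's retry list, which zio_vdev_io_retry() above drains from a taskq
 * after a delay(hz) and a vdev_reopen().
 */
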
static void
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	zio_vdev_io_exit(zio);

	ASSERT(zio->io_vsd == NULL);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		zio_t *zq;

		ASSERT(tvd == vd);
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;

		dprintf("retry #%d for %s to %s offset %llx\n",
		    zio->io_retries, zio_type_name[zio->io_type],
		    vdev_description(vd), zio->io_offset);

		/*
		 * If this is the first retry, do it immediately.
		 */
		/* XXPOLICY */
		if (zio->io_retries == 1) {
			zio_next_stage_async(zio);
			return;
		}

		/*
		 * This was not the first retry, so go through the
		 * longer enqueue/delay/vdev_reopen() process.
		 */
		mutex_enter(&tvd->vdev_io_lock);
		ASSERT(zio->io_retry_next == NULL);
		zio->io_retry_next = zq = tvd->vdev_io_retry;
		tvd->vdev_io_retry = zio;
		mutex_exit(&tvd->vdev_io_lock);
		if (zq == NULL)
			(void) taskq_dispatch(
			    tvd->vdev_spa->spa_vdev_retry_taskq,
			    zio_vdev_io_retry, tvd, TQ_SLEEP);
		return;
	}

	zio_next_stage(zio);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static void
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

static void
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error) {
			dprintf("bad checksum on vdev %s\n",
			    vdev_description(zio->io_vd));
		}
	}

	zio_next_stage(zio);

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static void
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

static void
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error) {
			dprintf("bad checksum on vdev %s\n",
			    vdev_description(zio->io_vd));
		}
	}

	zio_next_stage(zio);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on the block's identity
 * in the bp: the DVA's vdev and offset, plus the birth txg.
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
	zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
	zcp->zc_word[2] = zio->io_bp->blk_birth;
	zcp->zc_word[3] = 0;
}
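
/*
 * Conceptual sketch (the actual comparison happens inside
 * zio_checksum_error() during ZIO_STAGE_CHECKSUM_VERIFY): verifying a gang
 * header on read mirrors zio_gang_checksum_generate() above.  Both sides
 * derive the same verifier from the block's identity and use it to seed
 * the embedded checksum in gbh->zg_tail.zbt_cksum, roughly:
 *
 *	zio_cksum_t verifier;
 *
 *	zio_set_gang_verifier(zio, &verifier);
 *	(recompute the ZIO_CHECKSUM_GANG_HEADER checksum with this seed
 *	 in place and compare it against what was stored at write time)
 *
 * Because the verifier encodes the vdev, offset, and birth txg, a gang
 * header read back from the wrong location fails the check even if its
 * contents are internally consistent.
 */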

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef void zio_pipe_stage_t(zio_t *zio);

static void
zio_badop(zio_t *zio)
{
	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
}

zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	zio_wait_children_ready,
	zio_write_compress,
	zio_checksum_generate,
	zio_gang_pipeline,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_dva_translate,
	zio_vdev_io_setup,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_done,
	zio_badop
};

/*
 * Move an I/O to the next stage of the pipeline and execute that stage.
 * There's no locking on io_stage because there's no legitimate way for
 * multiple threads to be attempting to process the same I/O.
 */
void
zio_next_stage(zio_t *zio)
{
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	zio_pipeline[zio->io_stage](zio);
}

void
zio_next_stage_async(zio_t *zio)
{
	taskq_t *tq;
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	/*
	 * For performance, we'll probably want two sets of task queues:
	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
	 * part is for read performance: since we have to make a pass over
	 * the data to checksum it anyway, we want to do this on the same CPU
	 * that issued the read, because (assuming CPU scheduling affinity)
	 * that thread is probably still there.  Getting this optimization
	 * right avoids performance-hostile cache-to-cache transfers.
	 *
	 * Note that having two sets of task queues is also necessary for
	 * correctness: if all of the issue threads get bogged down waiting
	 * for dependent reads (e.g. metaslab freelist) to complete, then
	 * there won't be any threads available to service I/O completion
	 * interrupts.
	 */
	if ((1U << zio->io_stage) & zio->io_async_stages) {
		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
		else
			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
		(void) taskq_dispatch(tq,
		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
	} else {
		zio_pipeline[zio->io_stage](zio);
	}
}
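
/*
 * Worked example of the stage-advance loop used above (in both
 * zio_next_stage() and zio_next_stage_async(); the stage values are
 * illustrative): io_pipeline is a bitmask with one bit per stage, in the
 * same order as zio_pipeline[].  Suppose an I/O has just finished
 * ZIO_STAGE_CHECKSUM_VERIFY and its io_pipeline has the bits for the
 * read-gang and decompress stages clear (it is neither a gang block nor
 * compressed).  Then
 *
 *	while (((1U << ++zio->io_stage) & pipeline) == 0)
 *		continue;
 *
 * increments io_stage past both cleared bits and stops at ZIO_STAGE_DONE,
 * so the skipped stages are never entered.  Every pipeline is expected to
 * include ZIO_STAGE_DONE, which is what keeps the loop -- and the
 * ASSERT(zio->io_stage <= ZIO_STAGE_DONE) that follows it -- from running
 * off the end of zio_pipeline[].
 */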

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
    uint64_t txg)
{
	int error;

	spa_config_enter(spa, RW_READER);

	BP_ZERO(bp);

	error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);

	if (error == 0) {
		BP_SET_CHECKSUM(bp, checksum);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(bp, 0);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		bp->blk_birth = txg;
	}

	spa_config_exit(spa);

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);

	dprintf_bp(bp, "txg %llu: ", txg);

	spa_config_enter(spa, RW_READER);

	metaslab_free(spa, BP_IDENTITY(bp), txg);

	spa_config_exit(spa);
}
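
/*
 * Illustrative sketch (hypothetical caller, error handling elided; the
 * real intent log code is more involved): a log writer allocates a block
 * for the current transaction group, fills it in, and frees it the same
 * way once the record is no longer needed:
 *
 *	blkptr_t blk;
 *	int error;
 *
 *	error = zio_alloc_blk(spa, ZIO_CHECKSUM_ZILOG, size, &blk, txg);
 *	if (error == 0) {
 *		(write the log record to the location described by &blk)
 *	}
 *	...
 *	zio_free_blk(spa, &blk, txg);
 *
 * Because zio_alloc_blk() never allocates a gang block, the caller can
 * hand the bp straight to zio_free_blk() without the gang-freeing path.
 */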