1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 5789Sahrens * Common Development and Distribution License, Version 1.0 only 6789Sahrens * (the "License"). You may not use this file except in compliance 7789Sahrens * with the License. 8789Sahrens * 9789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10789Sahrens * or http://www.opensolaris.org/os/licensing. 11789Sahrens * See the License for the specific language governing permissions 12789Sahrens * and limitations under the License. 13789Sahrens * 14789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16789Sahrens * If applicable, add the following below this CDDL HEADER, with the 17789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19789Sahrens * 20789Sahrens * CDDL HEADER END 21789Sahrens */ 22789Sahrens /* 23789Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens #include <sys/zfs_context.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens static void zio_vdev_io_enter(zio_t *zio); 39789Sahrens static void zio_vdev_io_exit(zio_t *zio); 40789Sahrens 41789Sahrens /* 42789Sahrens * ========================================================================== 43789Sahrens * I/O priority table 44789Sahrens * ========================================================================== 45789Sahrens */ 46789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 47789Sahrens 0, /* ZIO_PRIORITY_NOW */ 48789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 49789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 50789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 51789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 52789Sahrens 4, /* ZIO_PRIORITY_FREE */ 53789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 54789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 55789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 56789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 57789Sahrens }; 58789Sahrens 59789Sahrens /* 60789Sahrens * ========================================================================== 61789Sahrens * I/O type descriptions 62789Sahrens * ========================================================================== 63789Sahrens */ 64789Sahrens char *zio_type_name[ZIO_TYPES] = { 65789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 66789Sahrens 67789Sahrens /* At or above this size, force gang blocking - for testing */ 68789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 69789Sahrens 70789Sahrens typedef struct zio_sync_pass { 71789Sahrens int zp_defer_free; /* defer frees after this pass */ 72789Sahrens int zp_dontcompress; /* don't compress after this pass */ 73789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 74789Sahrens } zio_sync_pass_t; 75789Sahrens 76789Sahrens zio_sync_pass_t zio_sync_pass = { 77789Sahrens 1, /* zp_defer_free */ 78789Sahrens 4, /* zp_dontcompress */ 79789Sahrens 1, /* zp_rewrite */ 80789Sahrens }; 81789Sahrens 82789Sahrens /* 83789Sahrens * ========================================================================== 84789Sahrens * I/O kmem caches 85789Sahrens * ========================================================================== 86789Sahrens */ 87789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 88789Sahrens 89789Sahrens void 90789Sahrens zio_init(void) 91789Sahrens { 92789Sahrens size_t c; 93789Sahrens 94789Sahrens /* 95789Sahrens * For small buffers, we want a cache for each multiple of 96789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 97789Sahrens * for each quarter-power of 2. For large buffers, we want 98789Sahrens * a cache for each multiple of PAGESIZE. 99789Sahrens */ 100789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 101789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 102789Sahrens size_t p2 = size; 103789Sahrens size_t align = 0; 104789Sahrens 105789Sahrens while (p2 & (p2 - 1)) 106789Sahrens p2 &= p2 - 1; 107789Sahrens 108789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 109789Sahrens align = SPA_MINBLOCKSIZE; 110789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 111789Sahrens align = PAGESIZE; 112789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 113789Sahrens align = p2 >> 2; 114789Sahrens } 115789Sahrens 116789Sahrens if (align != 0) { 117789Sahrens char name[30]; 118789Sahrens (void) sprintf(name, "zio_buf_%lu", size); 119789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 120*849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 121789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 122789Sahrens size, align); 123789Sahrens } 124789Sahrens } 125789Sahrens 126789Sahrens while (--c != 0) { 127789Sahrens ASSERT(zio_buf_cache[c] != NULL); 128789Sahrens if (zio_buf_cache[c - 1] == NULL) 129789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 130789Sahrens } 131789Sahrens } 132789Sahrens 133789Sahrens void 134789Sahrens zio_fini(void) 135789Sahrens { 136789Sahrens size_t c; 137789Sahrens kmem_cache_t *last_cache = NULL; 138789Sahrens 139789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 140789Sahrens if (zio_buf_cache[c] != last_cache) { 141789Sahrens last_cache = zio_buf_cache[c]; 142789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 143789Sahrens } 144789Sahrens zio_buf_cache[c] = NULL; 145789Sahrens } 146789Sahrens } 147789Sahrens 148789Sahrens /* 149789Sahrens * ========================================================================== 150789Sahrens * Allocate and free I/O buffers 151789Sahrens * ========================================================================== 152789Sahrens */ 153789Sahrens void * 154789Sahrens zio_buf_alloc(size_t size) 155789Sahrens { 156789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 157789Sahrens 158789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 159789Sahrens 160789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 161789Sahrens } 162789Sahrens 163789Sahrens void 164789Sahrens zio_buf_free(void *buf, size_t size) 165789Sahrens { 166789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 167789Sahrens 168789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 169789Sahrens 170789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 171789Sahrens } 172789Sahrens 173789Sahrens /* 174789Sahrens * ========================================================================== 175789Sahrens * Push and pop I/O transform buffers 176789Sahrens * ========================================================================== 177789Sahrens */ 178789Sahrens static void 179789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 180789Sahrens { 181789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 182789Sahrens 183789Sahrens zt->zt_data = data; 184789Sahrens zt->zt_size = size; 185789Sahrens zt->zt_bufsize = bufsize; 186789Sahrens 187789Sahrens zt->zt_next = zio->io_transform_stack; 188789Sahrens zio->io_transform_stack = zt; 189789Sahrens 190789Sahrens zio->io_data = data; 191789Sahrens zio->io_size = size; 192789Sahrens } 193789Sahrens 194789Sahrens static void 195789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 196789Sahrens { 197789Sahrens zio_transform_t *zt = zio->io_transform_stack; 198789Sahrens 199789Sahrens *data = zt->zt_data; 200789Sahrens *size = zt->zt_size; 201789Sahrens *bufsize = zt->zt_bufsize; 202789Sahrens 203789Sahrens zio->io_transform_stack = zt->zt_next; 204789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 205789Sahrens 206789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 207789Sahrens zio->io_data = zt->zt_data; 208789Sahrens zio->io_size = zt->zt_size; 209789Sahrens } 210789Sahrens } 211789Sahrens 212789Sahrens static void 213789Sahrens zio_clear_transform_stack(zio_t *zio) 214789Sahrens { 215789Sahrens void *data; 216789Sahrens uint64_t size, bufsize; 217789Sahrens 218789Sahrens ASSERT(zio->io_transform_stack != NULL); 219789Sahrens 220789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 221789Sahrens while (zio->io_transform_stack != NULL) { 222789Sahrens zio_buf_free(data, bufsize); 223789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 224789Sahrens } 225789Sahrens } 226789Sahrens 227789Sahrens /* 228789Sahrens * ========================================================================== 229789Sahrens * Create the various types of I/O (read, write, free) 230789Sahrens * ========================================================================== 231789Sahrens */ 232789Sahrens static zio_t * 233789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 234789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 235789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 236789Sahrens { 237789Sahrens zio_t *zio; 238789Sahrens 239789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 240789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 241789Sahrens 242789Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 243789Sahrens zio->io_parent = pio; 244789Sahrens zio->io_spa = spa; 245789Sahrens zio->io_txg = txg; 246789Sahrens if (bp != NULL) { 247789Sahrens zio->io_bp = bp; 248789Sahrens zio->io_bp_copy = *bp; 249789Sahrens zio->io_bp_orig = *bp; 250789Sahrens /* XXBP - Need to inherit this when it matters */ 251789Sahrens zio->io_dva_index = 0; 252789Sahrens } 253789Sahrens zio->io_done = done; 254789Sahrens zio->io_private = private; 255789Sahrens zio->io_type = type; 256789Sahrens zio->io_priority = priority; 257789Sahrens zio->io_stage = stage; 258789Sahrens zio->io_pipeline = pipeline; 259789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 260789Sahrens zio->io_timestamp = lbolt64; 261789Sahrens zio->io_flags = flags; 262789Sahrens zio_push_transform(zio, data, size, size); 263789Sahrens 264789Sahrens if (pio == NULL) { 265789Sahrens if (!(flags & ZIO_FLAG_CONFIG_HELD)) 266789Sahrens spa_config_enter(zio->io_spa, RW_READER); 267789Sahrens zio->io_root = zio; 268789Sahrens } else { 269789Sahrens zio->io_root = pio->io_root; 270789Sahrens 271789Sahrens mutex_enter(&pio->io_lock); 272789Sahrens if (stage < ZIO_STAGE_READY) 273789Sahrens pio->io_children_notready++; 274789Sahrens pio->io_children_notdone++; 275789Sahrens zio->io_sibling_next = pio->io_child; 276789Sahrens zio->io_sibling_prev = NULL; 277789Sahrens if (pio->io_child != NULL) 278789Sahrens pio->io_child->io_sibling_prev = zio; 279789Sahrens pio->io_child = zio; 280789Sahrens mutex_exit(&pio->io_lock); 281789Sahrens } 282789Sahrens 283789Sahrens return (zio); 284789Sahrens } 285789Sahrens 286789Sahrens zio_t * 287789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 288789Sahrens int flags) 289789Sahrens { 290789Sahrens zio_t *zio; 291789Sahrens 292789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 293789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 294789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 295789Sahrens 296789Sahrens return (zio); 297789Sahrens } 298789Sahrens 299789Sahrens zio_t * 300789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 301789Sahrens { 302789Sahrens return (zio_null(NULL, spa, done, private, flags)); 303789Sahrens } 304789Sahrens 305789Sahrens zio_t * 306789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 307789Sahrens uint64_t size, zio_done_func_t *done, void *private, 308789Sahrens int priority, int flags) 309789Sahrens { 310789Sahrens zio_t *zio; 311789Sahrens dva_t *dva; 312789Sahrens 313789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 314789Sahrens 315789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 316789Sahrens ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 317789Sahrens 318789Sahrens /* 319789Sahrens * Work off our copy of the bp so the caller can free it. 320789Sahrens */ 321789Sahrens zio->io_bp = &zio->io_bp_copy; 322789Sahrens 323789Sahrens bp = zio->io_bp; 324789Sahrens dva = ZIO_GET_DVA(zio); 325789Sahrens 326789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 327789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 328789Sahrens void *cbuf = zio_buf_alloc(csize); 329789Sahrens 330789Sahrens zio_push_transform(zio, cbuf, csize, csize); 331789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 332789Sahrens } 333789Sahrens 334789Sahrens if (DVA_GET_GANG(dva)) { 335789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 336789Sahrens void *gbuf = zio_buf_alloc(gsize); 337789Sahrens 338789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 339789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 340789Sahrens } 341789Sahrens 342789Sahrens return (zio); 343789Sahrens } 344789Sahrens 345789Sahrens zio_t * 346789Sahrens zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, 347789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 348789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 349789Sahrens { 350789Sahrens zio_t *zio; 351789Sahrens 352789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 353789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 354789Sahrens 355789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 356789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 357789Sahrens 358789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 359789Sahrens ZIO_TYPE_WRITE, priority, flags, 360789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 361789Sahrens 362789Sahrens zio->io_checksum = checksum; 363789Sahrens zio->io_compress = compress; 364789Sahrens 365789Sahrens if (compress != ZIO_COMPRESS_OFF) 366789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 367789Sahrens 368789Sahrens if (bp->blk_birth != txg) { 369789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 370789Sahrens BP_ZERO(bp); 371789Sahrens BP_SET_LSIZE(bp, size); 372789Sahrens BP_SET_PSIZE(bp, size); 373789Sahrens } 374789Sahrens 375789Sahrens return (zio); 376789Sahrens } 377789Sahrens 378789Sahrens zio_t * 379789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 380789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 381789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 382789Sahrens { 383789Sahrens zio_t *zio; 384789Sahrens 385789Sahrens /* XXBP - We need to re-evaluate when to insert pipeline stages */ 386789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 387789Sahrens ZIO_TYPE_WRITE, priority, flags, 388789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 389789Sahrens 390789Sahrens zio->io_checksum = checksum; 391789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 392789Sahrens 393789Sahrens return (zio); 394789Sahrens } 395789Sahrens 396789Sahrens static zio_t * 397789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 398789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 399789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 400789Sahrens { 401789Sahrens zio_t *zio; 402789Sahrens 403789Sahrens BP_ZERO(bp); 404789Sahrens BP_SET_LSIZE(bp, size); 405789Sahrens BP_SET_PSIZE(bp, size); 406789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 407789Sahrens 408789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 409789Sahrens ZIO_TYPE_WRITE, priority, flags, 410789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 411789Sahrens 412789Sahrens zio->io_checksum = checksum; 413789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 414789Sahrens 415789Sahrens return (zio); 416789Sahrens } 417789Sahrens 418789Sahrens zio_t * 419789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 420789Sahrens zio_done_func_t *done, void *private) 421789Sahrens { 422789Sahrens zio_t *zio; 423789Sahrens 424789Sahrens ASSERT(!BP_IS_HOLE(bp)); 425789Sahrens 426789Sahrens if (txg == spa->spa_syncing_txg && 427789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 428789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 429789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 430789Sahrens } 431789Sahrens 432789Sahrens /* XXBP - We need to re-evaluate when to insert pipeline stages */ 433789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 434789Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0, 435789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 436789Sahrens 437789Sahrens zio->io_bp = &zio->io_bp_copy; 438789Sahrens 439789Sahrens return (zio); 440789Sahrens } 441789Sahrens 442789Sahrens zio_t * 443789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 444789Sahrens zio_done_func_t *done, void *private) 445789Sahrens { 446789Sahrens zio_t *zio; 447789Sahrens 448789Sahrens /* 449789Sahrens * A claim is an allocation of a specific block. Claims are needed 450789Sahrens * to support immediate writes in the intent log. The issue is that 451789Sahrens * immediate writes contain committed data, but in a txg that was 452789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 453789Sahrens * the intent log claims all blocks that contain immediate write data 454789Sahrens * so that the SPA knows they're in use. 455789Sahrens * 456789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 457789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 458789Sahrens */ 459789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 460789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 461789Sahrens 462789Sahrens /* XXBP - We need to re-evaluate when to insert pipeline stages */ 463789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 464789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 465789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 466789Sahrens 467789Sahrens zio->io_bp = &zio->io_bp_copy; 468789Sahrens 469789Sahrens return (zio); 470789Sahrens } 471789Sahrens 472789Sahrens zio_t * 473789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 474789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 475789Sahrens { 476789Sahrens zio_t *zio; 477789Sahrens int c; 478789Sahrens 479789Sahrens if (vd->vdev_children == 0) { 480789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 481789Sahrens ZIO_TYPE_IOCTL, priority, flags, 482789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 483789Sahrens 484789Sahrens zio->io_vd = vd; 485789Sahrens zio->io_cmd = cmd; 486789Sahrens } else { 487789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 488789Sahrens 489789Sahrens for (c = 0; c < vd->vdev_children; c++) 490789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 491789Sahrens done, private, priority, flags)); 492789Sahrens } 493789Sahrens 494789Sahrens return (zio); 495789Sahrens } 496789Sahrens 497789Sahrens static void 498789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 499789Sahrens int checksum) 500789Sahrens { 501789Sahrens ASSERT(vd->vdev_children == 0); 502789Sahrens 503789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 504789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 505789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 506789Sahrens 507789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 508789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 509789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 510789Sahrens 511789Sahrens BP_ZERO(bp); 512789Sahrens 513789Sahrens BP_SET_LSIZE(bp, size); 514789Sahrens BP_SET_PSIZE(bp, size); 515789Sahrens 516789Sahrens BP_SET_CHECKSUM(bp, checksum); 517789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 518789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 519789Sahrens 520789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 521789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 522789Sahrens } 523789Sahrens 524789Sahrens zio_t * 525789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 526789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 527789Sahrens int priority, int flags) 528789Sahrens { 529789Sahrens zio_t *zio; 530789Sahrens blkptr_t blk; 531789Sahrens 532789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 533789Sahrens 534789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 535789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 536789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 537789Sahrens 538789Sahrens zio->io_vd = vd; 539789Sahrens zio->io_offset = offset; 540789Sahrens 541789Sahrens /* 542789Sahrens * Work off our copy of the bp so the caller can free it. 543789Sahrens */ 544789Sahrens zio->io_bp = &zio->io_bp_copy; 545789Sahrens 546789Sahrens return (zio); 547789Sahrens } 548789Sahrens 549789Sahrens zio_t * 550789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 551789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 552789Sahrens int priority, int flags) 553789Sahrens { 554789Sahrens zio_block_tail_t *zbt; 555789Sahrens void *wbuf; 556789Sahrens zio_t *zio; 557789Sahrens blkptr_t blk; 558789Sahrens 559789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 560789Sahrens 561789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 562789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 563789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 564789Sahrens 565789Sahrens zio->io_vd = vd; 566789Sahrens zio->io_offset = offset; 567789Sahrens 568789Sahrens zio->io_bp = &zio->io_bp_copy; 569789Sahrens zio->io_checksum = checksum; 570789Sahrens 571789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 572789Sahrens /* 573789Sahrens * zbt checksums are necessarily destructive -- they modify 574789Sahrens * one word of the write buffer to hold the verifier/checksum. 575789Sahrens * Therefore, we must make a local copy in case the data is 576789Sahrens * being written to multiple places. 577789Sahrens */ 578789Sahrens wbuf = zio_buf_alloc(size); 579789Sahrens bcopy(data, wbuf, size); 580789Sahrens zio_push_transform(zio, wbuf, size, size); 581789Sahrens 582789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 583789Sahrens zbt->zbt_cksum = blk.blk_cksum; 584789Sahrens } 585789Sahrens 586789Sahrens return (zio); 587789Sahrens } 588789Sahrens 589789Sahrens /* 590789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 591789Sahrens */ 592789Sahrens zio_t * 593789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 594789Sahrens void *data, uint64_t size, int type, int priority, int flags, 595789Sahrens zio_done_func_t *done, void *private) 596789Sahrens { 597789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 598789Sahrens zio_t *cio; 599789Sahrens 600789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 601789Sahrens /* 602789Sahrens * If we have the bp, then the child should perform the 603789Sahrens * checksum and the parent need not. This pushes error 604789Sahrens * detection as close to the leaves as possible and 605789Sahrens * eliminates redundant checksums in the interior nodes. 606789Sahrens */ 607789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 608789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 609789Sahrens } 610789Sahrens 611789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 612789Sahrens done, private, type, priority, 613789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 614789Sahrens ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline); 615789Sahrens 616789Sahrens cio->io_vd = vd; 617789Sahrens cio->io_offset = offset; 618789Sahrens 619789Sahrens return (cio); 620789Sahrens } 621789Sahrens 622789Sahrens /* 623789Sahrens * ========================================================================== 624789Sahrens * Initiate I/O, either sync or async 625789Sahrens * ========================================================================== 626789Sahrens */ 627789Sahrens int 628789Sahrens zio_wait(zio_t *zio) 629789Sahrens { 630789Sahrens int error; 631789Sahrens 632789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 633789Sahrens 634789Sahrens zio->io_waiter = curthread; 635789Sahrens 636789Sahrens zio_next_stage_async(zio); 637789Sahrens 638789Sahrens mutex_enter(&zio->io_lock); 639789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 640789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 641789Sahrens mutex_exit(&zio->io_lock); 642789Sahrens 643789Sahrens error = zio->io_error; 644789Sahrens 645789Sahrens kmem_free(zio, sizeof (zio_t)); 646789Sahrens 647789Sahrens return (error); 648789Sahrens } 649789Sahrens 650789Sahrens void 651789Sahrens zio_nowait(zio_t *zio) 652789Sahrens { 653789Sahrens zio_next_stage_async(zio); 654789Sahrens } 655789Sahrens 656789Sahrens /* 657789Sahrens * ========================================================================== 658789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 659789Sahrens * ========================================================================== 660789Sahrens */ 661789Sahrens static void 662789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 663789Sahrens { 664789Sahrens mutex_enter(&zio->io_lock); 665789Sahrens if (*countp == 0) { 666789Sahrens ASSERT(zio->io_stalled == 0); 667789Sahrens mutex_exit(&zio->io_lock); 668789Sahrens zio_next_stage(zio); 669789Sahrens } else { 670789Sahrens if (zio->io_stage == ZIO_STAGE_VDEV_IO_START) 671789Sahrens zio_vdev_io_exit(zio); 672789Sahrens zio->io_stalled = stage; 673789Sahrens mutex_exit(&zio->io_lock); 674789Sahrens } 675789Sahrens } 676789Sahrens 677789Sahrens static void 678789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 679789Sahrens { 680789Sahrens zio_t *pio = zio->io_parent; 681789Sahrens 682789Sahrens mutex_enter(&pio->io_lock); 683789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 684789Sahrens pio->io_error = zio->io_error; 685789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 686789Sahrens if (pio->io_stage == ZIO_STAGE_VDEV_IO_START) 687789Sahrens zio_vdev_io_enter(pio); 688789Sahrens pio->io_stalled = 0; 689789Sahrens mutex_exit(&pio->io_lock); 690789Sahrens zio_next_stage_async(pio); 691789Sahrens } else { 692789Sahrens mutex_exit(&pio->io_lock); 693789Sahrens } 694789Sahrens } 695789Sahrens 696789Sahrens static void 697789Sahrens zio_wait_children_ready(zio_t *zio) 698789Sahrens { 699789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 700789Sahrens &zio->io_children_notready); 701789Sahrens } 702789Sahrens 703789Sahrens void 704789Sahrens zio_wait_children_done(zio_t *zio) 705789Sahrens { 706789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 707789Sahrens &zio->io_children_notdone); 708789Sahrens } 709789Sahrens 710789Sahrens static void 711789Sahrens zio_ready(zio_t *zio) 712789Sahrens { 713789Sahrens zio_t *pio = zio->io_parent; 714789Sahrens 715789Sahrens if (pio != NULL) 716789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 717789Sahrens &pio->io_children_notready); 718789Sahrens 719789Sahrens if (zio->io_bp) 720789Sahrens zio->io_bp_copy = *zio->io_bp; 721789Sahrens 722789Sahrens zio_next_stage(zio); 723789Sahrens } 724789Sahrens 725789Sahrens static void 726789Sahrens zio_done(zio_t *zio) 727789Sahrens { 728789Sahrens zio_t *pio = zio->io_parent; 729789Sahrens spa_t *spa = zio->io_spa; 730789Sahrens blkptr_t *bp = zio->io_bp; 731789Sahrens vdev_t *vd = zio->io_vd; 732789Sahrens char blkbuf[300]; 733789Sahrens 734789Sahrens ASSERT(zio->io_children_notready == 0); 735789Sahrens ASSERT(zio->io_children_notdone == 0); 736789Sahrens 737789Sahrens if (bp != NULL) { 738789Sahrens ASSERT(bp->blk_pad[0] == 0); 739789Sahrens ASSERT(bp->blk_pad[1] == 0); 740789Sahrens ASSERT(bp->blk_pad[2] == 0); 741789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 742789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 743789Sahrens !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) 744789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 745789Sahrens } 746789Sahrens 747789Sahrens if (vd != NULL) 748789Sahrens vdev_stat_update(zio); 749789Sahrens 750789Sahrens if (zio->io_error) { 751789Sahrens sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy); 752789Sahrens dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n", 753789Sahrens zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", 754789Sahrens zio_type_name[zio->io_type], 755789Sahrens vdev_description(vd), 756789Sahrens (u_longlong_t)zio->io_offset, 757789Sahrens zio, blkbuf, zio->io_error); 758789Sahrens } 759789Sahrens 760789Sahrens if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) { 761789Sahrens sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy); 762789Sahrens dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n", 763789Sahrens "partial write", 764789Sahrens zio_type_name[zio->io_type], 765789Sahrens vdev_description(vd), 766789Sahrens (u_longlong_t)zio->io_offset, 767789Sahrens zio, blkbuf, zio->io_numerrors); 768789Sahrens } 769789Sahrens 770789Sahrens if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 771789Sahrens sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy); 772789Sahrens panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d", 773789Sahrens zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", 774789Sahrens zio_type_name[zio->io_type], 775789Sahrens vdev_description(vd), 776789Sahrens (u_longlong_t)zio->io_offset, 777789Sahrens zio, blkbuf, zio->io_error); 778789Sahrens } 779789Sahrens 780789Sahrens zio_clear_transform_stack(zio); 781789Sahrens 782789Sahrens if (zio->io_done) 783789Sahrens zio->io_done(zio); 784789Sahrens 785789Sahrens ASSERT(zio->io_delegate_list == NULL); 786789Sahrens ASSERT(zio->io_delegate_next == NULL); 787789Sahrens 788789Sahrens if (pio != NULL) { 789789Sahrens zio_t *next, *prev; 790789Sahrens 791789Sahrens mutex_enter(&pio->io_lock); 792789Sahrens next = zio->io_sibling_next; 793789Sahrens prev = zio->io_sibling_prev; 794789Sahrens if (next != NULL) 795789Sahrens next->io_sibling_prev = prev; 796789Sahrens if (prev != NULL) 797789Sahrens prev->io_sibling_next = next; 798789Sahrens if (pio->io_child == zio) 799789Sahrens pio->io_child = next; 800789Sahrens mutex_exit(&pio->io_lock); 801789Sahrens 802789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 803789Sahrens &pio->io_children_notdone); 804789Sahrens } 805789Sahrens 806789Sahrens if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) 807789Sahrens spa_config_exit(spa); 808789Sahrens 809789Sahrens if (zio->io_waiter != NULL) { 810789Sahrens mutex_enter(&zio->io_lock); 811789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 812789Sahrens zio->io_stalled = zio->io_stage; 813789Sahrens cv_broadcast(&zio->io_cv); 814789Sahrens mutex_exit(&zio->io_lock); 815789Sahrens } else { 816789Sahrens kmem_free(zio, sizeof (zio_t)); 817789Sahrens } 818789Sahrens } 819789Sahrens 820789Sahrens /* 821789Sahrens * ========================================================================== 822789Sahrens * Compression support 823789Sahrens * ========================================================================== 824789Sahrens */ 825789Sahrens static void 826789Sahrens zio_write_compress(zio_t *zio) 827789Sahrens { 828789Sahrens int compress = zio->io_compress; 829789Sahrens blkptr_t *bp = zio->io_bp; 830789Sahrens void *cbuf; 831789Sahrens uint64_t lsize = zio->io_size; 832789Sahrens uint64_t csize = lsize; 833789Sahrens uint64_t cbufsize = 0; 834789Sahrens int pass; 835789Sahrens 836789Sahrens if (bp->blk_birth == zio->io_txg) { 837789Sahrens /* 838789Sahrens * We're rewriting an existing block, which means we're 839789Sahrens * working on behalf of spa_sync(). For spa_sync() to 840789Sahrens * converge, it must eventually be the case that we don't 841789Sahrens * have to allocate new blocks. But compression changes 842789Sahrens * the blocksize, which forces a reallocate, and makes 843789Sahrens * convergence take longer. Therefore, after the first 844789Sahrens * few passes, stop compressing to ensure convergence. 845789Sahrens */ 846789Sahrens pass = spa_sync_pass(zio->io_spa); 847789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 848789Sahrens compress = ZIO_COMPRESS_OFF; 849789Sahrens } else { 850789Sahrens ASSERT(BP_IS_HOLE(bp)); 851789Sahrens pass = 1; 852789Sahrens } 853789Sahrens 854789Sahrens if (compress != ZIO_COMPRESS_OFF) 855789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 856789Sahrens &cbuf, &csize, &cbufsize)) 857789Sahrens compress = ZIO_COMPRESS_OFF; 858789Sahrens 859789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 860789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 861789Sahrens 862789Sahrens /* 863789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 864789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 865789Sahrens * but newly allocated blocks are sequential, so they can be written 866789Sahrens * to disk faster. Therefore, we allow the first few passes of 867789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 868789Sahrens * There should only be a handful of blocks after pass 1 in any case. 869789Sahrens */ 870789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 871789Sahrens pass > zio_sync_pass.zp_rewrite) { 872789Sahrens ASSERT(csize != 0); 873789Sahrens ASSERT3U(BP_GET_COMPRESS(bp), ==, compress); 874789Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 875789Sahrens 876789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 877789Sahrens } else { 878789Sahrens if (bp->blk_birth == zio->io_txg) { 879789Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 880789Sahrens bzero(bp, sizeof (blkptr_t)); 881789Sahrens } 882789Sahrens if (csize == 0) { 883789Sahrens BP_ZERO(bp); 884789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 885789Sahrens } else { 886789Sahrens BP_SET_LSIZE(bp, lsize); 887789Sahrens BP_SET_PSIZE(bp, csize); 888789Sahrens BP_SET_COMPRESS(bp, compress); 889789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 890789Sahrens } 891789Sahrens } 892789Sahrens 893789Sahrens zio_next_stage(zio); 894789Sahrens } 895789Sahrens 896789Sahrens static void 897789Sahrens zio_read_decompress(zio_t *zio) 898789Sahrens { 899789Sahrens blkptr_t *bp = zio->io_bp; 900789Sahrens void *data; 901789Sahrens uint64_t size; 902789Sahrens uint64_t bufsize; 903789Sahrens int compress = BP_GET_COMPRESS(bp); 904789Sahrens 905789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 906789Sahrens 907789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 908789Sahrens 909789Sahrens if (zio_decompress_data(compress, data, size, 910789Sahrens zio->io_data, zio->io_size)) 911789Sahrens zio->io_error = EIO; 912789Sahrens 913789Sahrens zio_buf_free(data, bufsize); 914789Sahrens 915789Sahrens zio_next_stage(zio); 916789Sahrens } 917789Sahrens 918789Sahrens /* 919789Sahrens * ========================================================================== 920789Sahrens * Gang block support 921789Sahrens * ========================================================================== 922789Sahrens */ 923789Sahrens static void 924789Sahrens zio_gang_pipeline(zio_t *zio) 925789Sahrens { 926789Sahrens /* 927789Sahrens * By default, the pipeline assumes that we're dealing with a gang 928789Sahrens * block. If we're not, strip out any gang-specific stages. 929789Sahrens */ 930789Sahrens if (!DVA_GET_GANG(ZIO_GET_DVA(zio))) 931789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 932789Sahrens 933789Sahrens zio_next_stage(zio); 934789Sahrens } 935789Sahrens 936789Sahrens static void 937789Sahrens zio_gang_byteswap(zio_t *zio) 938789Sahrens { 939789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 940789Sahrens 941789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 942789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 943789Sahrens } 944789Sahrens 945789Sahrens static void 946789Sahrens zio_get_gang_header(zio_t *zio) 947789Sahrens { 948789Sahrens blkptr_t *bp = zio->io_bp; 949789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 950789Sahrens void *gbuf = zio_buf_alloc(gsize); 951789Sahrens 952789Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 953789Sahrens 954789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 955789Sahrens 956789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 957789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 958789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 959789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 960789Sahrens 961789Sahrens zio_wait_children_done(zio); 962789Sahrens } 963789Sahrens 964789Sahrens static void 965789Sahrens zio_read_gang_members(zio_t *zio) 966789Sahrens { 967789Sahrens zio_gbh_phys_t *gbh; 968789Sahrens uint64_t gsize, gbufsize, loff, lsize; 969789Sahrens int i; 970789Sahrens 971789Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 972789Sahrens 973789Sahrens zio_gang_byteswap(zio); 974789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 975789Sahrens 976789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 977789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 978789Sahrens lsize = BP_GET_PSIZE(gbp); 979789Sahrens 980789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 981789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 982789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 983789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 984789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 985789Sahrens 986789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 987789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 988789Sahrens zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT)); 989789Sahrens } 990789Sahrens 991789Sahrens zio_buf_free(gbh, gbufsize); 992789Sahrens zio_wait_children_done(zio); 993789Sahrens } 994789Sahrens 995789Sahrens static void 996789Sahrens zio_rewrite_gang_members(zio_t *zio) 997789Sahrens { 998789Sahrens zio_gbh_phys_t *gbh; 999789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1000789Sahrens int i; 1001789Sahrens 1002789Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 1003789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1004789Sahrens 1005789Sahrens zio_gang_byteswap(zio); 1006789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1007789Sahrens 1008789Sahrens ASSERT(gsize == gbufsize); 1009789Sahrens 1010789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1011789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1012789Sahrens lsize = BP_GET_PSIZE(gbp); 1013789Sahrens 1014789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1015789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1016789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1017789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1018789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1019789Sahrens 1020789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1021789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 1022789Sahrens NULL, NULL, zio->io_priority, zio->io_flags)); 1023789Sahrens } 1024789Sahrens 1025789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1026789Sahrens zio_wait_children_ready(zio); 1027789Sahrens } 1028789Sahrens 1029789Sahrens static void 1030789Sahrens zio_free_gang_members(zio_t *zio) 1031789Sahrens { 1032789Sahrens zio_gbh_phys_t *gbh; 1033789Sahrens uint64_t gsize, gbufsize; 1034789Sahrens int i; 1035789Sahrens 1036789Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 1037789Sahrens 1038789Sahrens zio_gang_byteswap(zio); 1039789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1040789Sahrens 1041789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1042789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1043789Sahrens 1044789Sahrens if (BP_IS_HOLE(gbp)) 1045789Sahrens continue; 1046789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1047789Sahrens gbp, NULL, NULL)); 1048789Sahrens } 1049789Sahrens 1050789Sahrens zio_buf_free(gbh, gbufsize); 1051789Sahrens zio_next_stage(zio); 1052789Sahrens } 1053789Sahrens 1054789Sahrens static void 1055789Sahrens zio_claim_gang_members(zio_t *zio) 1056789Sahrens { 1057789Sahrens zio_gbh_phys_t *gbh; 1058789Sahrens uint64_t gsize, gbufsize; 1059789Sahrens int i; 1060789Sahrens 1061789Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 1062789Sahrens 1063789Sahrens zio_gang_byteswap(zio); 1064789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1065789Sahrens 1066789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1067789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1068789Sahrens if (BP_IS_HOLE(gbp)) 1069789Sahrens continue; 1070789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1071789Sahrens gbp, NULL, NULL)); 1072789Sahrens } 1073789Sahrens 1074789Sahrens zio_buf_free(gbh, gbufsize); 1075789Sahrens zio_next_stage(zio); 1076789Sahrens } 1077789Sahrens 1078789Sahrens static void 1079789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1080789Sahrens { 1081789Sahrens zio_t *pio = zio->io_parent; 1082789Sahrens dva_t *cdva = ZIO_GET_DVA(zio); 1083789Sahrens dva_t *pdva = ZIO_GET_DVA(pio); 1084789Sahrens uint64_t asize; 1085789Sahrens 1086789Sahrens ASSERT(DVA_GET_GANG(pdva)); 1087789Sahrens 1088789Sahrens /* XXBP - Need to be careful here with multiple DVAs */ 1089789Sahrens mutex_enter(&pio->io_lock); 1090789Sahrens asize = DVA_GET_ASIZE(pdva); 1091789Sahrens asize += DVA_GET_ASIZE(cdva); 1092789Sahrens DVA_SET_ASIZE(pdva, asize); 1093789Sahrens mutex_exit(&pio->io_lock); 1094789Sahrens } 1095789Sahrens 1096789Sahrens static void 1097789Sahrens zio_write_allocate_gang_members(zio_t *zio) 1098789Sahrens { 1099789Sahrens blkptr_t *bp = zio->io_bp; 1100789Sahrens dva_t *dva = ZIO_GET_DVA(zio); 1101789Sahrens zio_gbh_phys_t *gbh; 1102789Sahrens uint64_t resid = zio->io_size; 1103789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1104789Sahrens uint64_t gsize, loff, lsize; 1105789Sahrens uint32_t gbps_left; 1106789Sahrens int error; 1107789Sahrens int i; 1108789Sahrens 1109789Sahrens gsize = SPA_GANGBLOCKSIZE; 1110789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1111789Sahrens 1112789Sahrens error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg); 1113789Sahrens if (error == ENOSPC) 1114789Sahrens panic("can't allocate gang block header"); 1115789Sahrens ASSERT(error == 0); 1116789Sahrens 1117789Sahrens DVA_SET_GANG(dva, 1); 1118789Sahrens 1119789Sahrens bp->blk_birth = zio->io_txg; 1120789Sahrens 1121789Sahrens gbh = zio_buf_alloc(gsize); 1122789Sahrens bzero(gbh, gsize); 1123789Sahrens 1124789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1125789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1126789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1127789Sahrens dva = &gbp->blk_dva[0]; 1128789Sahrens 1129789Sahrens ASSERT(gbps_left != 0); 1130789Sahrens maxalloc = MIN(maxalloc, resid); 1131789Sahrens 1132789Sahrens while (resid <= maxalloc * gbps_left) { 1133789Sahrens error = metaslab_alloc(zio->io_spa, maxalloc, dva, 1134789Sahrens zio->io_txg); 1135789Sahrens if (error == 0) 1136789Sahrens break; 1137789Sahrens ASSERT3U(error, ==, ENOSPC); 1138789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1139789Sahrens panic("really out of space"); 1140789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1141789Sahrens } 1142789Sahrens 1143789Sahrens if (resid <= maxalloc * gbps_left) { 1144789Sahrens lsize = maxalloc; 1145789Sahrens BP_SET_LSIZE(gbp, lsize); 1146789Sahrens BP_SET_PSIZE(gbp, lsize); 1147789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 1148789Sahrens gbp->blk_birth = zio->io_txg; 1149789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, 1150789Sahrens zio->io_checksum, zio->io_txg, gbp, 1151789Sahrens (char *)zio->io_data + loff, lsize, 1152789Sahrens zio_write_allocate_gang_member_done, NULL, 1153789Sahrens zio->io_priority, zio->io_flags)); 1154789Sahrens } else { 1155789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1156789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 1157789Sahrens zio_nowait(zio_write_allocate(zio, zio->io_spa, 1158789Sahrens zio->io_checksum, zio->io_txg, gbp, 1159789Sahrens (char *)zio->io_data + loff, lsize, 1160789Sahrens zio_write_allocate_gang_member_done, NULL, 1161789Sahrens zio->io_priority, zio->io_flags)); 1162789Sahrens } 1163789Sahrens } 1164789Sahrens 1165789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1166789Sahrens 1167789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1168789Sahrens 1169789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 1170789Sahrens zio_wait_children_done(zio); 1171789Sahrens } 1172789Sahrens 1173789Sahrens /* 1174789Sahrens * ========================================================================== 1175789Sahrens * Allocate and free blocks 1176789Sahrens * ========================================================================== 1177789Sahrens */ 1178789Sahrens static void 1179789Sahrens zio_dva_allocate(zio_t *zio) 1180789Sahrens { 1181789Sahrens blkptr_t *bp = zio->io_bp; 1182789Sahrens dva_t *dva = ZIO_GET_DVA(zio); 1183789Sahrens int error; 1184789Sahrens 1185789Sahrens ASSERT(BP_IS_HOLE(bp)); 1186789Sahrens 1187789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1188789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 1189789Sahrens zio_write_allocate_gang_members(zio); 1190789Sahrens return; 1191789Sahrens } 1192789Sahrens 1193789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1194789Sahrens 1195789Sahrens error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg); 1196789Sahrens 1197789Sahrens if (error == 0) { 1198789Sahrens bp->blk_birth = zio->io_txg; 1199789Sahrens } else if (error == ENOSPC) { 1200789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1201789Sahrens panic("really, truly out of space"); 1202789Sahrens zio_write_allocate_gang_members(zio); 1203789Sahrens return; 1204789Sahrens } else { 1205789Sahrens zio->io_error = error; 1206789Sahrens } 1207789Sahrens zio_next_stage(zio); 1208789Sahrens } 1209789Sahrens 1210789Sahrens static void 1211789Sahrens zio_dva_free(zio_t *zio) 1212789Sahrens { 1213789Sahrens blkptr_t *bp = zio->io_bp; 1214789Sahrens dva_t *dva = ZIO_GET_DVA(zio); 1215789Sahrens 1216789Sahrens ASSERT(!BP_IS_HOLE(bp)); 1217789Sahrens 1218789Sahrens metaslab_free(zio->io_spa, dva, zio->io_txg); 1219789Sahrens 1220789Sahrens BP_ZERO(bp); 1221789Sahrens 1222789Sahrens zio_next_stage(zio); 1223789Sahrens } 1224789Sahrens 1225789Sahrens static void 1226789Sahrens zio_dva_claim(zio_t *zio) 1227789Sahrens { 1228789Sahrens blkptr_t *bp = zio->io_bp; 1229789Sahrens dva_t *dva = ZIO_GET_DVA(zio); 1230789Sahrens 1231789Sahrens ASSERT(!BP_IS_HOLE(bp)); 1232789Sahrens 1233789Sahrens zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg); 1234789Sahrens 1235789Sahrens zio_next_stage(zio); 1236789Sahrens } 1237789Sahrens 1238789Sahrens static void 1239789Sahrens zio_dva_translate(zio_t *zio) 1240789Sahrens { 1241789Sahrens spa_t *spa = zio->io_spa; 1242789Sahrens dva_t *dva = ZIO_GET_DVA(zio); 1243789Sahrens uint64_t vdev = DVA_GET_VDEV(dva); 1244789Sahrens uint64_t offset = DVA_GET_OFFSET(dva); 1245789Sahrens 1246789Sahrens ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio)); 1247789Sahrens 1248789Sahrens zio->io_offset = offset; 1249789Sahrens 1250789Sahrens if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL) 1251789Sahrens zio->io_error = ENXIO; 1252789Sahrens else if (offset + zio->io_size > zio->io_vd->vdev_asize) 1253789Sahrens zio->io_error = EOVERFLOW; 1254789Sahrens 1255789Sahrens zio_next_stage(zio); 1256789Sahrens } 1257789Sahrens 1258789Sahrens /* 1259789Sahrens * ========================================================================== 1260789Sahrens * Read and write to physical devices 1261789Sahrens * ========================================================================== 1262789Sahrens */ 1263789Sahrens static void 1264789Sahrens zio_vdev_io_enter(zio_t *zio) 1265789Sahrens { 1266789Sahrens vdev_t *tvd = zio->io_vd->vdev_top; 1267789Sahrens 1268789Sahrens mutex_enter(&tvd->vdev_io_lock); 1269789Sahrens ASSERT(zio->io_pending.list_next == NULL); 1270789Sahrens list_insert_tail(&tvd->vdev_io_pending, zio); 1271789Sahrens mutex_exit(&tvd->vdev_io_lock); 1272789Sahrens } 1273789Sahrens 1274789Sahrens static void 1275789Sahrens zio_vdev_io_exit(zio_t *zio) 1276789Sahrens { 1277789Sahrens vdev_t *tvd = zio->io_vd->vdev_top; 1278789Sahrens 1279789Sahrens mutex_enter(&tvd->vdev_io_lock); 1280789Sahrens ASSERT(zio->io_pending.list_next != NULL); 1281789Sahrens list_remove(&tvd->vdev_io_pending, zio); 1282789Sahrens if (list_head(&tvd->vdev_io_pending) == NULL) 1283789Sahrens cv_broadcast(&tvd->vdev_io_cv); 1284789Sahrens mutex_exit(&tvd->vdev_io_lock); 1285789Sahrens } 1286789Sahrens 1287789Sahrens static void 1288789Sahrens zio_vdev_io_retry(void *vdarg) 1289789Sahrens { 1290789Sahrens vdev_t *vd = vdarg; 1291789Sahrens zio_t *zio, *zq; 1292789Sahrens 1293789Sahrens ASSERT(vd == vd->vdev_top); 1294789Sahrens 1295789Sahrens /* XXPOLICY */ 1296789Sahrens delay(hz); 1297789Sahrens 1298789Sahrens vdev_reopen(vd, &zq); 1299789Sahrens 1300789Sahrens while ((zio = zq) != NULL) { 1301789Sahrens zq = zio->io_retry_next; 1302789Sahrens zio->io_retry_next = NULL; 1303789Sahrens dprintf("async retry #%d for I/O to %s offset %llx\n", 1304789Sahrens zio->io_retries, vdev_description(vd), zio->io_offset); 1305789Sahrens zio_next_stage_async(zio); 1306789Sahrens } 1307789Sahrens } 1308789Sahrens 1309789Sahrens static void 1310789Sahrens zio_vdev_io_setup(zio_t *zio) 1311789Sahrens { 1312789Sahrens vdev_t *vd = zio->io_vd; 1313789Sahrens 1314789Sahrens /* XXPOLICY */ 1315789Sahrens if (zio->io_retries == 0 && vd == vd->vdev_top) 1316789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1317789Sahrens 1318789Sahrens if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { 1319789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1320789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1321789Sahrens } 1322789Sahrens 1323789Sahrens zio_vdev_io_enter(zio); 1324789Sahrens 1325789Sahrens zio_next_stage(zio); 1326789Sahrens } 1327789Sahrens 1328789Sahrens static void 1329789Sahrens zio_vdev_io_start(zio_t *zio) 1330789Sahrens { 1331789Sahrens blkptr_t *bp = zio->io_bp; 1332789Sahrens 1333789Sahrens ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0); 1334789Sahrens ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0); 1335789Sahrens ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size); 1336789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1337789Sahrens 1338789Sahrens vdev_io_start(zio); 1339789Sahrens 1340789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1341789Sahrens } 1342789Sahrens 1343789Sahrens static void 1344789Sahrens zio_vdev_io_done(zio_t *zio) 1345789Sahrens { 1346789Sahrens vdev_io_done(zio); 1347789Sahrens } 1348789Sahrens 1349789Sahrens /* XXPOLICY */ 1350789Sahrens static boolean_t 1351789Sahrens zio_should_retry(zio_t *zio) 1352789Sahrens { 1353789Sahrens vdev_t *vd = zio->io_vd; 1354789Sahrens 1355789Sahrens if (zio->io_error == 0) 1356789Sahrens return (B_FALSE); 1357789Sahrens if (zio->io_delegate_list != NULL) 1358789Sahrens return (B_FALSE); 1359789Sahrens if (vd != vd->vdev_top) 1360789Sahrens return (B_FALSE); 1361789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1362789Sahrens return (B_FALSE); 1363789Sahrens if (zio->io_retries > 300 && 1364789Sahrens (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL))) 1365789Sahrens return (B_FALSE); 1366789Sahrens if (zio->io_retries > 1 && 1367789Sahrens (zio->io_error == ECKSUM || zio->io_error == ENXIO)) 1368789Sahrens return (B_FALSE); 1369789Sahrens 1370789Sahrens return (B_TRUE); 1371789Sahrens } 1372789Sahrens 1373789Sahrens static void 1374789Sahrens zio_vdev_io_assess(zio_t *zio) 1375789Sahrens { 1376789Sahrens vdev_t *vd = zio->io_vd; 1377789Sahrens vdev_t *tvd = vd->vdev_top; 1378789Sahrens 1379789Sahrens zio_vdev_io_exit(zio); 1380789Sahrens 1381789Sahrens ASSERT(zio->io_vsd == NULL); 1382789Sahrens 1383789Sahrens /* 1384789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1385789Sahrens */ 1386789Sahrens /* XXPOLICY */ 1387789Sahrens if (zio_should_retry(zio)) { 1388789Sahrens zio_t *zq; 1389789Sahrens 1390789Sahrens ASSERT(tvd == vd); 1391789Sahrens ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)); 1392789Sahrens 1393789Sahrens zio->io_retries++; 1394789Sahrens zio->io_error = 0; 1395789Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT; 1396789Sahrens /* XXPOLICY */ 1397789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1398789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1399789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1; 1400789Sahrens 1401789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1402789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1403789Sahrens vdev_description(vd), zio->io_offset); 1404789Sahrens 1405789Sahrens /* 1406789Sahrens * If this is the first retry, do it immediately. 1407789Sahrens */ 1408789Sahrens /* XXPOLICY */ 1409789Sahrens if (zio->io_retries == 1) { 1410789Sahrens zio_next_stage_async(zio); 1411789Sahrens return; 1412789Sahrens } 1413789Sahrens 1414789Sahrens /* 1415789Sahrens * This was not the first retry, so go through the 1416789Sahrens * longer enqueue/delay/vdev_reopen() process. 1417789Sahrens */ 1418789Sahrens mutex_enter(&tvd->vdev_io_lock); 1419789Sahrens ASSERT(zio->io_retry_next == NULL); 1420789Sahrens zio->io_retry_next = zq = tvd->vdev_io_retry; 1421789Sahrens tvd->vdev_io_retry = zio; 1422789Sahrens mutex_exit(&tvd->vdev_io_lock); 1423789Sahrens if (zq == NULL) 1424789Sahrens (void) taskq_dispatch( 1425789Sahrens tvd->vdev_spa->spa_vdev_retry_taskq, 1426789Sahrens zio_vdev_io_retry, tvd, TQ_SLEEP); 1427789Sahrens return; 1428789Sahrens } 1429789Sahrens 1430789Sahrens zio_next_stage(zio); 1431789Sahrens } 1432789Sahrens 1433789Sahrens void 1434789Sahrens zio_vdev_io_reissue(zio_t *zio) 1435789Sahrens { 1436789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1437789Sahrens ASSERT(zio->io_error == 0); 1438789Sahrens 1439789Sahrens zio->io_stage--; 1440789Sahrens } 1441789Sahrens 1442789Sahrens void 1443789Sahrens zio_vdev_io_redone(zio_t *zio) 1444789Sahrens { 1445789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1446789Sahrens 1447789Sahrens zio->io_stage--; 1448789Sahrens } 1449789Sahrens 1450789Sahrens void 1451789Sahrens zio_vdev_io_bypass(zio_t *zio) 1452789Sahrens { 1453789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1454789Sahrens ASSERT(zio->io_error == 0); 1455789Sahrens 1456789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1457789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1458789Sahrens } 1459789Sahrens 1460789Sahrens /* 1461789Sahrens * ========================================================================== 1462789Sahrens * Generate and verify checksums 1463789Sahrens * ========================================================================== 1464789Sahrens */ 1465789Sahrens static void 1466789Sahrens zio_checksum_generate(zio_t *zio) 1467789Sahrens { 1468789Sahrens int checksum = zio->io_checksum; 1469789Sahrens blkptr_t *bp = zio->io_bp; 1470789Sahrens 1471789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1472789Sahrens 1473789Sahrens BP_SET_CHECKSUM(bp, checksum); 1474789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1475789Sahrens 1476789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1477789Sahrens 1478789Sahrens zio_next_stage(zio); 1479789Sahrens } 1480789Sahrens 1481789Sahrens static void 1482789Sahrens zio_gang_checksum_generate(zio_t *zio) 1483789Sahrens { 1484789Sahrens zio_cksum_t zc; 1485789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1486789Sahrens 1487789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1488789Sahrens ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio))); 1489789Sahrens 1490789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1491789Sahrens 1492789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1493789Sahrens 1494789Sahrens zio_next_stage(zio); 1495789Sahrens } 1496789Sahrens 1497789Sahrens static void 1498789Sahrens zio_checksum_verify(zio_t *zio) 1499789Sahrens { 1500789Sahrens if (zio->io_bp != NULL) { 1501789Sahrens zio->io_error = zio_checksum_error(zio); 1502789Sahrens if (zio->io_error) { 1503789Sahrens dprintf("bad checksum on vdev %s\n", 1504789Sahrens vdev_description(zio->io_vd)); 1505789Sahrens } 1506789Sahrens } 1507789Sahrens 1508789Sahrens zio_next_stage(zio); 1509789Sahrens } 1510789Sahrens 1511789Sahrens /* 1512789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1513789Sahrens */ 1514789Sahrens void 1515789Sahrens zio_checksum_verified(zio_t *zio) 1516789Sahrens { 1517789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1518789Sahrens } 1519789Sahrens 1520789Sahrens /* 1521789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1522789Sahrens */ 1523789Sahrens void 1524789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1525789Sahrens { 1526789Sahrens zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio)); 1527789Sahrens zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio)); 1528789Sahrens zcp->zc_word[2] = zio->io_bp->blk_birth; 1529789Sahrens zcp->zc_word[3] = 0; 1530789Sahrens } 1531789Sahrens 1532789Sahrens /* 1533789Sahrens * ========================================================================== 1534789Sahrens * Define the pipeline 1535789Sahrens * ========================================================================== 1536789Sahrens */ 1537789Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1538789Sahrens 1539789Sahrens static void 1540789Sahrens zio_badop(zio_t *zio) 1541789Sahrens { 1542789Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1543789Sahrens } 1544789Sahrens 1545789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1546789Sahrens zio_badop, 1547789Sahrens zio_wait_children_ready, 1548789Sahrens zio_write_compress, 1549789Sahrens zio_checksum_generate, 1550789Sahrens zio_gang_pipeline, 1551789Sahrens zio_get_gang_header, 1552789Sahrens zio_rewrite_gang_members, 1553789Sahrens zio_free_gang_members, 1554789Sahrens zio_claim_gang_members, 1555789Sahrens zio_dva_allocate, 1556789Sahrens zio_dva_free, 1557789Sahrens zio_dva_claim, 1558789Sahrens zio_gang_checksum_generate, 1559789Sahrens zio_ready, 1560789Sahrens zio_dva_translate, 1561789Sahrens zio_vdev_io_setup, 1562789Sahrens zio_vdev_io_start, 1563789Sahrens zio_vdev_io_done, 1564789Sahrens zio_vdev_io_assess, 1565789Sahrens zio_wait_children_done, 1566789Sahrens zio_checksum_verify, 1567789Sahrens zio_read_gang_members, 1568789Sahrens zio_read_decompress, 1569789Sahrens zio_done, 1570789Sahrens zio_badop 1571789Sahrens }; 1572789Sahrens 1573789Sahrens /* 1574789Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1575789Sahrens * There's no locking on io_stage because there's no legitimate way for 1576789Sahrens * multiple threads to be attempting to process the same I/O. 1577789Sahrens */ 1578789Sahrens void 1579789Sahrens zio_next_stage(zio_t *zio) 1580789Sahrens { 1581789Sahrens uint32_t pipeline = zio->io_pipeline; 1582789Sahrens 1583789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1584789Sahrens 1585789Sahrens if (zio->io_error) { 1586789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1587789Sahrens zio, vdev_description(zio->io_vd), 1588789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1589789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1590789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1591789Sahrens } 1592789Sahrens 1593789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1594789Sahrens continue; 1595789Sahrens 1596789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1597789Sahrens ASSERT(zio->io_stalled == 0); 1598789Sahrens 1599789Sahrens zio_pipeline[zio->io_stage](zio); 1600789Sahrens } 1601789Sahrens 1602789Sahrens void 1603789Sahrens zio_next_stage_async(zio_t *zio) 1604789Sahrens { 1605789Sahrens taskq_t *tq; 1606789Sahrens uint32_t pipeline = zio->io_pipeline; 1607789Sahrens 1608789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1609789Sahrens 1610789Sahrens if (zio->io_error) { 1611789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1612789Sahrens zio, vdev_description(zio->io_vd), 1613789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1614789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1615789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1616789Sahrens } 1617789Sahrens 1618789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1619789Sahrens continue; 1620789Sahrens 1621789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1622789Sahrens ASSERT(zio->io_stalled == 0); 1623789Sahrens 1624789Sahrens /* 1625789Sahrens * For performance, we'll probably want two sets of task queues: 1626789Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1627789Sahrens * part is for read performance: since we have to make a pass over 1628789Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1629789Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1630789Sahrens * that thread is probably still there. Getting this optimization 1631789Sahrens * right avoids performance-hostile cache-to-cache transfers. 1632789Sahrens * 1633789Sahrens * Note that having two sets of task queues is also necessary for 1634789Sahrens * correctness: if all of the issue threads get bogged down waiting 1635789Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1636789Sahrens * there won't be any threads available to service I/O completion 1637789Sahrens * interrupts. 1638789Sahrens */ 1639789Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1640789Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1641789Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1642789Sahrens else 1643789Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1644789Sahrens (void) taskq_dispatch(tq, 1645789Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1646789Sahrens } else { 1647789Sahrens zio_pipeline[zio->io_stage](zio); 1648789Sahrens } 1649789Sahrens } 1650789Sahrens 1651789Sahrens /* 1652789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1653789Sahrens */ 1654789Sahrens int 1655789Sahrens zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, 1656789Sahrens uint64_t txg) 1657789Sahrens { 1658789Sahrens int error; 1659789Sahrens 1660789Sahrens spa_config_enter(spa, RW_READER); 1661789Sahrens 1662789Sahrens BP_ZERO(bp); 1663789Sahrens 1664789Sahrens error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg); 1665789Sahrens 1666789Sahrens if (error == 0) { 1667789Sahrens BP_SET_CHECKSUM(bp, checksum); 1668789Sahrens BP_SET_LSIZE(bp, size); 1669789Sahrens BP_SET_PSIZE(bp, size); 1670789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 1671789Sahrens BP_SET_TYPE(bp, DMU_OT_INTENT_LOG); 1672789Sahrens BP_SET_LEVEL(bp, 0); 1673789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1674789Sahrens bp->blk_birth = txg; 1675789Sahrens } 1676789Sahrens 1677789Sahrens spa_config_exit(spa); 1678789Sahrens 1679789Sahrens return (error); 1680789Sahrens } 1681789Sahrens 1682789Sahrens /* 1683789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 1684789Sahrens * nothing to do except metaslab_free() it. 1685789Sahrens */ 1686789Sahrens void 1687789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1688789Sahrens { 1689789Sahrens ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0); 1690789Sahrens 1691789Sahrens dprintf_bp(bp, "txg %llu: ", txg); 1692789Sahrens 1693789Sahrens spa_config_enter(spa, RW_READER); 1694789Sahrens 1695789Sahrens metaslab_free(spa, BP_IDENTITY(bp), txg); 1696789Sahrens 1697789Sahrens spa_config_exit(spa); 1698789Sahrens } 1699