/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	4,	/* ZIO_PRIORITY_FREE */
	0,	/* ZIO_PRIORITY_CACHE_FILL */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

/* At or above this size, force gang blocking - for testing */
uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;

/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;

typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);

			dprintf("creating cache for size %5lx align %5lx\n",
			    size, align);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_data = data;
	zt->zt_size = size;
	zt->zt_bufsize = bufsize;

	zt->zt_next = zio->io_transform_stack;
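	/*
	 * The transforms form a LIFO stack: the entry pushed here becomes
	 * the new head, and io_data/io_size are pointed at it below, so
	 * subsequent pipeline stages always operate on the most recently
	 * pushed buffer until zio_pop_transform() unwinds it.
	 */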
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	zio_transform_t *zt = zio->io_transform_stack;

	*data = zt->zt_data;
	*size = zt->zt_size;
	*bufsize = zt->zt_bufsize;

	zio->io_transform_stack = zt->zt_next;
	kmem_free(zt, sizeof (zio_transform_t));

	if ((zt = zio->io_transform_stack) != NULL) {
		zio->io_data = zt->zt_data;
		zio->io_size = zt->zt_size;
	}
}

static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	ASSERT(zio->io_transform_stack != NULL);

	zio_pop_transform(zio, &data, &size, &bufsize);
	while (zio->io_transform_stack != NULL) {
		zio_buf_free(data, bufsize);
		zio_pop_transform(zio, &data, &size, &bufsize);
	}
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_flags = flags;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
		    BP_GET_LEVEL(bp) != 0)
			zio->io_flags |= ZIO_FLAG_METADATA;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
	zio->io_timestamp = lbolt64;
	if (pio != NULL)
		zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
	zio_push_transform(zio, data, size, size);

	/*
	 * Note on config lock:
	 *
	 * If CONFIG_HELD is set, then the caller already has the config
	 * lock, so we don't need it for this io.
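	 * (zio_done() keys off CONFIG_GRABBED, not CONFIG_HELD, when it
	 * decides whether to call spa_config_exit(), so a lock the caller
	 * already held is never released on its behalf.)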
	 *
	 * We set CONFIG_GRABBED to indicate that we have grabbed the
	 * config lock on behalf of this io, so it should be released
	 * in zio_done.
	 *
	 * Unless CONFIG_HELD is set, we will grab the config lock for
	 * any top-level (parent-less) io, *except* NULL top-level ios.
	 * The NULL top-level ios rarely have any children, so we delay
	 * grabbing the lock until the first child is added (but it is
	 * still grabbed on behalf of the top-level i/o, so additional
	 * children don't need to also grab it).  This greatly reduces
	 * contention on the config lock.
	 */
	if (pio == NULL) {
		if (type != ZIO_TYPE_NULL &&
		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
			spa_config_enter(zio->io_spa, RW_READER, zio);
			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
		}
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;
		if (!(flags & ZIO_FLAG_NOBOOKMARK))
			zio->io_logical = pio->io_logical;
		mutex_enter(&pio->io_lock);
		if (pio->io_parent == NULL &&
		    pio->io_type == ZIO_TYPE_NULL &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
			spa_config_enter(zio->io_spa, RW_READER, pio);
		}
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		zio->io_ndvas = pio->io_ndvas;
		mutex_exit(&pio->io_lock);
	}

	return (zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT3U(size, ==, BP_GET_LSIZE(bp));

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (BP_IS_GANG(bp)) {
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
	    checksum < ZIO_CHECKSUM_FUNCTIONS);

	ASSERT(compress >= ZIO_COMPRESS_OFF &&
	    compress < ZIO_COMPRESS_FUNCTIONS);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;

	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	zio->io_checksum = checksum;
	zio->io_compress = compress;
	zio->io_ndvas = ncopies;

	if (compress != ZIO_COMPRESS_OFF)
		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
	} else {
		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags,
    zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	zio->io_bookmark = *zb;
	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	if (pio != NULL)
		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));

	return (zio);
}

static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	return (zio);
}

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg == spa->spa_syncing_txg &&
	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, 0));
	}

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum)
{
	ASSERT(vd->vdev_children == 0);

	ASSERT(size <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	BP_ZERO(bp);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags)
{
	zio_block_tail_t *zbt;
	void *wbuf;
	zio_t *zio;
	blkptr_t blk;

	zio_phys_bp_init(vd, &blk, offset, size, checksum);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	zio->io_bp = &zio->io_bp_copy;
	zio->io_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * one word of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places.
		 */
		wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);

		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
		zbt->zbt_cksum = blk.blk_cksum;
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.  It has no associated bp.
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
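		 * (The two lines below add ZIO_STAGE_CHECKSUM_VERIFY to the
		 * child's pipeline and clear it from the parent's, so each
		 * block is verified exactly once, as close to the disk as
		 * possible.)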
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	zio->io_waiter = curthread;

	zio_next_stage_async(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	zio_next_stage_async(zio);
}

/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */
static void
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	mutex_enter(&zio->io_lock);
	if (*countp == 0) {
		ASSERT(zio->io_stalled == 0);
		mutex_exit(&zio->io_lock);
		zio_next_stage(zio);
	} else {
		zio->io_stalled = stage;
		mutex_exit(&zio->io_lock);
	}
}

static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		pio->io_error = zio->io_error;
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_next_stage_async(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_wait_children_ready(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
	    &zio->io_children_notready);
}

void
zio_wait_children_done(zio_t *zio)
{
	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
	    &zio->io_children_notdone);
}

static void
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (zio->io_ready)
		zio->io_ready(zio);

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	zio_next_stage(zio);
}

static void
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			if (zio->io_ndvas != 0)
				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO,
			    zio->io_spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
		    zio->io_logical == zio) {
			/*
			 * For root I/O requests, tell the SPA to log the error
			 * appropriately.  Also, generate a logical data
			 * ereport.
			 */
			spa_log_error(zio->io_spa, zio);

			zfs_ereport_post(FM_EREPORT_ZFS_DATA,
			    zio->io_spa, NULL, zio, 0, 0);
		}

		/*
		 * For I/O requests that cannot fail, panic appropriately.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			char *blkbuf;

			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
			if (blkbuf) {
				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
				    bp ? bp : &zio->io_bp_copy);
			}
			panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
			    "%d", zio->io_error == ECKSUM ?
			    "bad checksum" : "I/O failure",
			    zio_type_name[zio->io_type],
			    vdev_description(vd),
			    (u_longlong_t)zio->io_offset,
			    zio, blkbuf ? blkbuf : "", zio->io_error);
		}
	}
	zio_clear_transform_stack(zio);

	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/*
	 * Note: this I/O is now done, and will shortly be freed, so there is
	 * no need to clear this (or any other) flag.
	 */
	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
		spa_config_exit(spa, zio);

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		mutex_destroy(&zio->io_lock);
		cv_destroy(&zio->io_cv);
		kmem_cache_free(zio_cache, zio);
	}
}

/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */
static void
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
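		 * (With the defaults above, zio_sync_pass.zp_dontcompress
		 * is 4, so compression is disabled from sync pass 5 onward.)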
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_COMPRESS(bp, compress);
		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
	} else {
		if (bp->blk_birth == zio->io_txg)
			BP_ZERO(bp);
		if (csize == 0) {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
		}
	}

	zio_next_stage(zio);
}

static void
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	zio_next_stage(zio);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */
static void
zio_gang_pipeline(zio_t *zio)
{
	/*
	 * By default, the pipeline assumes that we're dealing with a gang
	 * block.  If we're not, strip out any gang-specific stages.
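	 * (A gang block's DVAs have their gang bit set and point at a
	 * SPA_GANGBLOCKSIZE header of smaller block pointers rather than
	 * at the data itself; see zio_write_allocate_gang_members().)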
	 */
	if (!BP_IS_GANG(zio->io_bp))
		zio->io_pipeline &= ~ZIO_GANG_STAGES;

	zio_next_stage(zio);
}

static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

static void
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(BP_IS_GANG(bp));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));

	zio_wait_children_done(zio);
}

static void
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
		    &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);
	zio_wait_children_done(zio);
}

static void
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority, zio->io_flags,
		    &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);
	zio_wait_children_ready(zio);
}

static void
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

static void
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);
	zio_next_stage(zio);
}

static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;
	int d;

	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static void
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error == ENOSPC)
		panic("can't allocate gang block header");
	ASSERT(error == 0);

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	/* We need to test multi-level gang blocks */
	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			if (maxalloc == SPA_MINBLOCKSIZE)
				panic("really out of space");
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags,
			    &zio->io_bookmark));
		} else {
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority, zio->io_flags));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);
	/*
	 * As much as we'd like this to be zio_wait_children_ready(),
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
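	 * (zio_write_allocate_gang_member_done() above is the callback
	 * that folds each member's allocated size into the parent bp's
	 * DVA ASIZE fields.)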
13371775Sbillm */ 1338789Sahrens zio_wait_children_done(zio); 1339789Sahrens } 1340789Sahrens 1341789Sahrens /* 1342789Sahrens * ========================================================================== 1343789Sahrens * Allocate and free blocks 1344789Sahrens * ========================================================================== 1345789Sahrens */ 1346789Sahrens static void 1347789Sahrens zio_dva_allocate(zio_t *zio) 1348789Sahrens { 13494527Sperrin spa_t *spa = zio->io_spa; 13504527Sperrin metaslab_class_t *mc = spa->spa_normal_class; 1351789Sahrens blkptr_t *bp = zio->io_bp; 1352789Sahrens int error; 1353789Sahrens 1354789Sahrens ASSERT(BP_IS_HOLE(bp)); 13551775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 13561775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 13574527Sperrin ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); 1358789Sahrens 1359789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1360789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 13614527Sperrin zio_write_allocate_gang_members(zio, mc); 1362789Sahrens return; 1363789Sahrens } 1364789Sahrens 1365789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1366789Sahrens 13674527Sperrin error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, 13683063Sperrin zio->io_txg, NULL, B_FALSE); 1369789Sahrens 1370789Sahrens if (error == 0) { 1371789Sahrens bp->blk_birth = zio->io_txg; 1372789Sahrens } else if (error == ENOSPC) { 1373789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1374789Sahrens panic("really, truly out of space"); 13754527Sperrin zio_write_allocate_gang_members(zio, mc); 1376789Sahrens return; 1377789Sahrens } else { 1378789Sahrens zio->io_error = error; 1379789Sahrens } 1380789Sahrens zio_next_stage(zio); 1381789Sahrens } 1382789Sahrens 1383789Sahrens static void 1384789Sahrens zio_dva_free(zio_t *zio) 1385789Sahrens { 1386789Sahrens blkptr_t *bp = zio->io_bp; 1387789Sahrens 13881807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1389789Sahrens 1390789Sahrens BP_ZERO(bp); 1391789Sahrens 1392789Sahrens zio_next_stage(zio); 1393789Sahrens } 1394789Sahrens 1395789Sahrens static void 1396789Sahrens zio_dva_claim(zio_t *zio) 1397789Sahrens { 13981807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1399789Sahrens 1400789Sahrens zio_next_stage(zio); 1401789Sahrens } 1402789Sahrens 1403789Sahrens /* 1404789Sahrens * ========================================================================== 1405789Sahrens * Read and write to physical devices 1406789Sahrens * ========================================================================== 1407789Sahrens */ 1408789Sahrens 1409789Sahrens static void 14101775Sbillm zio_vdev_io_start(zio_t *zio) 1411789Sahrens { 1412789Sahrens vdev_t *vd = zio->io_vd; 14131775Sbillm vdev_t *tvd = vd ? 
vd->vdev_top : NULL; 14141775Sbillm blkptr_t *bp = zio->io_bp; 14151775Sbillm uint64_t align; 1416789Sahrens 14171775Sbillm if (vd == NULL) { 14181775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14191775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 14201775Sbillm return; 14211775Sbillm } 14221775Sbillm 14231775Sbillm align = 1ULL << tvd->vdev_ashift; 14241775Sbillm 14251732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1426789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1427789Sahrens 14281775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 14291775Sbillm vd->vdev_children == 0) { 1430789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1431789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1432789Sahrens } 1433789Sahrens 14341732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 14351732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 14361732Sbonwick char *abuf = zio_buf_alloc(asize); 14371732Sbonwick ASSERT(vd == tvd); 14381732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 14391732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 14401732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 14411732Sbonwick } 14421732Sbonwick zio_push_transform(zio, abuf, asize, asize); 14431732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 14441732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 14451732Sbonwick } 14461732Sbonwick 14471732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 14481732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 14491732Sbonwick ASSERT(bp == NULL || 14501732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1451789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1452789Sahrens 1453789Sahrens vdev_io_start(zio); 1454789Sahrens 1455789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1456789Sahrens } 1457789Sahrens 1458789Sahrens static void 1459789Sahrens zio_vdev_io_done(zio_t *zio) 1460789Sahrens { 14611775Sbillm if (zio->io_vd == NULL) 14621775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14631775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 14641775Sbillm else 14651775Sbillm vdev_io_done(zio); 1466789Sahrens } 1467789Sahrens 1468789Sahrens /* XXPOLICY */ 14691544Seschrock boolean_t 1470789Sahrens zio_should_retry(zio_t *zio) 1471789Sahrens { 1472789Sahrens vdev_t *vd = zio->io_vd; 1473789Sahrens 1474789Sahrens if (zio->io_error == 0) 1475789Sahrens return (B_FALSE); 1476789Sahrens if (zio->io_delegate_list != NULL) 1477789Sahrens return (B_FALSE); 14781775Sbillm if (vd && vd != vd->vdev_top) 1479789Sahrens return (B_FALSE); 1480789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1481789Sahrens return (B_FALSE); 14821544Seschrock if (zio->io_retries > 0) 1483789Sahrens return (B_FALSE); 1484789Sahrens 1485789Sahrens return (B_TRUE); 1486789Sahrens } 1487789Sahrens 1488789Sahrens static void 1489789Sahrens zio_vdev_io_assess(zio_t *zio) 1490789Sahrens { 1491789Sahrens vdev_t *vd = zio->io_vd; 14921775Sbillm vdev_t *tvd = vd ? 
static void
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;

	ASSERT(zio->io_vsd == NULL);

	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
		void *abuf;
		uint64_t asize;
		ASSERT(vd == tvd);
		zio_pop_transform(zio, &abuf, &asize, &asize);
		if (zio->io_type == ZIO_TYPE_READ)
			bcopy(abuf, zio->io_data, zio->io_size);
		zio_buf_free(abuf, asize);
		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
	}

	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		ASSERT(tvd == vd);

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
		    ZIO_FLAG_CONFIG_GRABBED;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;

		dprintf("retry #%d for %s to %s offset %llx\n",
		    zio->io_retries, zio_type_name[zio->io_type],
		    vdev_description(vd), zio->io_offset);

		zio_next_stage_async(zio);
		return;
	}

	zio_next_stage(zio);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static void
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}

static void
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	zio_next_stage(zio);
}
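/*
 * Verify the checksum of a block we just read.  On failure, post a
 * FM_EREPORT_ZFS_CHECKSUM ereport unless the I/O was marked
 * ZIO_FLAG_SPECULATIVE.
 */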
static void
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
	}

	zio_next_stage(zio);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on the bp's identity
 * DVA (vdev and offset) and its birth txg.
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	blkptr_t *bp = zio->io_bp;

	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
	zcp->zc_word[2] = bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef void zio_pipe_stage_t(zio_t *zio);

static void
zio_badop(zio_t *zio)
{
	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
}

zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	zio_badop,
	zio_wait_children_ready,
	zio_write_compress,
	zio_checksum_generate,
	zio_gang_pipeline,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_done,
	zio_badop
};
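/*
 * zio_pipeline[] is indexed by io_stage, so its entries must remain in
 * ZIO_STAGE_* enum order; zio_badop bounds the table at both ends so that
 * dispatching an out-of-range stage panics immediately.
 */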
/*
 * Move an I/O to the next stage of the pipeline and execute that stage.
 * There's no locking on io_stage because there's no legitimate way for
 * multiple threads to be attempting to process the same I/O.
 */
void
zio_next_stage(zio_t *zio)
{
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	/*
	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
	 */
	if (((1U << zio->io_stage) & zio->io_async_stages) &&
	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
		(void) taskq_dispatch(tq,
		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
	} else {
		zio_pipeline[zio->io_stage](zio);
	}
}
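/*
 * Like zio_next_stage(), but hand asynchronous stages off to a per-type
 * taskq: stages before VDEV_IO_DONE go to the spa's issue taskqs, later
 * stages go to the interrupt taskqs.
 */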
void
zio_next_stage_async(zio_t *zio)
{
	taskq_t *tq;
	uint32_t pipeline = zio->io_pipeline;

	ASSERT(!MUTEX_HELD(&zio->io_lock));

	if (zio->io_error) {
		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
		    zio, vdev_description(zio->io_vd),
		    zio->io_offset, zio->io_stage, zio->io_error);
		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;
	}

	while (((1U << ++zio->io_stage) & pipeline) == 0)
		continue;

	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
	ASSERT(zio->io_stalled == 0);

	/*
	 * For performance, we'll probably want two sets of task queues:
	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
	 * part is for read performance: since we have to make a pass over
	 * the data to checksum it anyway, we want to do this on the same CPU
	 * that issued the read, because (assuming CPU scheduling affinity)
	 * that thread is probably still there.  Getting this optimization
	 * right avoids performance-hostile cache-to-cache transfers.
	 *
	 * Note that having two sets of task queues is also necessary for
	 * correctness: if all of the issue threads get bogged down waiting
	 * for dependent reads (e.g. metaslab freelist) to complete, then
	 * there won't be any threads available to service I/O completion
	 * interrupts.
	 */
	if ((1U << zio->io_stage) & zio->io_async_stages) {
		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
		else
			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
		(void) taskq_dispatch(tq,
		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
	} else {
		zio_pipeline[zio->io_stage](zio);
	}
}
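/*
 * Testing hook: when zio_zil_fail_shift is nonzero, report an allocation
 * failure once every 2^zio_zil_fail_shift calls (see zio_alloc_blk()) so
 * that intent log allocation-failure handling gets exercised.
 */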
static boolean_t
zio_alloc_should_fail(void)
{
	static uint16_t allocs = 0;

	return (P2PHASE(allocs++, 1U << zio_zil_fail_shift) == 0);
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)
{
	int error;

	spa_config_enter(spa, RW_READER, FTAG);

	if (zio_zil_fail_shift && zio_alloc_should_fail()) {
		spa_config_exit(spa, FTAG);
		return (ENOSPC);
	}

	/*
	 * We were passed the previous log block's DVA in old_bp->blk_dva[0].
	 * We use that as a hint for which vdev to allocate from next.
	 */
	error = metaslab_alloc(spa, spa->spa_log_class, size,
	    new_bp, 1, txg, old_bp, B_TRUE);

	if (error)
		error = metaslab_alloc(spa, spa->spa_normal_class, size,
		    new_bp, 1, txg, old_bp, B_TRUE);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
		new_bp->blk_birth = txg;
	}

	spa_config_exit(spa, FTAG);

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(!BP_IS_GANG(bp));

	spa_config_enter(spa, RW_READER, FTAG);

	metaslab_free(spa, bp, txg, B_FALSE);

	spa_config_exit(spa, FTAG);
}

/*
 * Start an async flush of the write cache for this vdev.
 */
void
zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
{
	vdev_t *vd;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	if (*zio == NULL)
		*zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	vd = vdev_lookup_top(spa, vdev);
	ASSERT(vd);

	(void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));

	spa_config_exit(spa, FTAG);
}
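/*
 * Illustrative use of zio_flush_vdev() (a sketch, not code from this file;
 * the top-level vdev ids shown are hypothetical): flush several vdevs in
 * parallel under one root zio, then wait for all of the flushes at once.
 *
 *	zio_t *flush_zio = NULL;
 *
 *	zio_flush_vdev(spa, 0, &flush_zio);
 *	zio_flush_vdev(spa, 1, &flush_zio);
 *	if (flush_zio != NULL)
 *		(void) zio_wait(flush_zio);
 */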