1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 223459Sek110237 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 
24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 673668Sgw25295 /* Force an allocation failure when non-zero */ 683668Sgw25295 uint16_t zio_zil_fail_shift = 0; 693668Sgw25295 70789Sahrens typedef struct zio_sync_pass { 71789Sahrens int zp_defer_free; /* defer frees after this 
pass */ 72789Sahrens int zp_dontcompress; /* don't compress after this pass */ 73789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 74789Sahrens } zio_sync_pass_t; 75789Sahrens 76789Sahrens zio_sync_pass_t zio_sync_pass = { 77789Sahrens 1, /* zp_defer_free */ 78789Sahrens 4, /* zp_dontcompress */ 79789Sahrens 1, /* zp_rewrite */ 80789Sahrens }; 81789Sahrens 82789Sahrens /* 83789Sahrens * ========================================================================== 84789Sahrens * I/O kmem caches 85789Sahrens * ========================================================================== 86789Sahrens */ 874055Seschrock kmem_cache_t *zio_cache; 88789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 893290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 903290Sjohansen 913290Sjohansen #ifdef _KERNEL 923290Sjohansen extern vmem_t *zio_alloc_arena; 933290Sjohansen #endif 94789Sahrens 95789Sahrens void 96789Sahrens zio_init(void) 97789Sahrens { 98789Sahrens size_t c; 993290Sjohansen vmem_t *data_alloc_arena = NULL; 1003290Sjohansen 1013290Sjohansen #ifdef _KERNEL 1023290Sjohansen data_alloc_arena = zio_alloc_arena; 1033290Sjohansen #endif 104789Sahrens 1054055Seschrock zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 1064055Seschrock NULL, NULL, NULL, NULL, NULL, 0); 1074055Seschrock 108789Sahrens /* 109789Sahrens * For small buffers, we want a cache for each multiple of 110789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 111789Sahrens * for each quarter-power of 2. For large buffers, we want 112789Sahrens * a cache for each multiple of PAGESIZE. 
113789Sahrens */ 114789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 115789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 116789Sahrens size_t p2 = size; 117789Sahrens size_t align = 0; 118789Sahrens 119789Sahrens while (p2 & (p2 - 1)) 120789Sahrens p2 &= p2 - 1; 121789Sahrens 122789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 123789Sahrens align = SPA_MINBLOCKSIZE; 124789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 125789Sahrens align = PAGESIZE; 126789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 127789Sahrens align = p2 >> 2; 128789Sahrens } 129789Sahrens 130789Sahrens if (align != 0) { 1313290Sjohansen char name[36]; 1322856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 133789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 134849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1353290Sjohansen 1363290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1373290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1383290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1393290Sjohansen KMC_NODEBUG); 1403290Sjohansen 141789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 142789Sahrens size, align); 143789Sahrens } 144789Sahrens } 145789Sahrens 146789Sahrens while (--c != 0) { 147789Sahrens ASSERT(zio_buf_cache[c] != NULL); 148789Sahrens if (zio_buf_cache[c - 1] == NULL) 149789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1503290Sjohansen 1513290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1523290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1533290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 154789Sahrens } 1551544Seschrock 1561544Seschrock zio_inject_init(); 157789Sahrens } 158789Sahrens 159789Sahrens void 160789Sahrens zio_fini(void) 161789Sahrens { 162789Sahrens size_t c; 163789Sahrens kmem_cache_t *last_cache = NULL; 1643290Sjohansen kmem_cache_t *last_data_cache = NULL; 165789Sahrens 166789Sahrens for (c = 0; c < 
SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 167789Sahrens if (zio_buf_cache[c] != last_cache) { 168789Sahrens last_cache = zio_buf_cache[c]; 169789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 170789Sahrens } 171789Sahrens zio_buf_cache[c] = NULL; 1723290Sjohansen 1733290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1743290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1753290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1763290Sjohansen } 1773290Sjohansen zio_data_buf_cache[c] = NULL; 178789Sahrens } 1791544Seschrock 1804055Seschrock kmem_cache_destroy(zio_cache); 1814055Seschrock 1821544Seschrock zio_inject_fini(); 183789Sahrens } 184789Sahrens 185789Sahrens /* 186789Sahrens * ========================================================================== 187789Sahrens * Allocate and free I/O buffers 188789Sahrens * ========================================================================== 189789Sahrens */ 1903290Sjohansen 1913290Sjohansen /* 1923290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1933290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1943290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1953290Sjohansen * excess / transient data in-core during a crashdump. 1963290Sjohansen */ 197789Sahrens void * 198789Sahrens zio_buf_alloc(size_t size) 199789Sahrens { 200789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 201789Sahrens 202789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 203789Sahrens 204789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 205789Sahrens } 206789Sahrens 2073290Sjohansen /* 2083290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2093290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2103290Sjohansen * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount 2113290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2123290Sjohansen */ 2133290Sjohansen void * 2143290Sjohansen zio_data_buf_alloc(size_t size) 2153290Sjohansen { 2163290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2173290Sjohansen 2183290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2193290Sjohansen 2203290Sjohansen return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 2213290Sjohansen } 2223290Sjohansen 223789Sahrens void 224789Sahrens zio_buf_free(void *buf, size_t size) 225789Sahrens { 226789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 227789Sahrens 228789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 229789Sahrens 230789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 231789Sahrens } 232789Sahrens 2333290Sjohansen void 2343290Sjohansen zio_data_buf_free(void *buf, size_t size) 2353290Sjohansen { 2363290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2373290Sjohansen 2383290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2393290Sjohansen 2403290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2413290Sjohansen } 2423463Sahrens 243789Sahrens /* 244789Sahrens * ========================================================================== 245789Sahrens * Push and pop I/O transform buffers 246789Sahrens * ========================================================================== 247789Sahrens */ 248789Sahrens static void 249789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 250789Sahrens { 251789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 252789Sahrens 253789Sahrens zt->zt_data = data; 254789Sahrens zt->zt_size = size; 255789Sahrens zt->zt_bufsize = bufsize; 256789Sahrens 257789Sahrens zt->zt_next = zio->io_transform_stack; 258789Sahrens zio->io_transform_stack = zt; 259789Sahrens 260789Sahrens zio->io_data = data; 261789Sahrens zio->io_size = size; 262789Sahrens } 263789Sahrens 
264789Sahrens static void 265789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 266789Sahrens { 267789Sahrens zio_transform_t *zt = zio->io_transform_stack; 268789Sahrens 269789Sahrens *data = zt->zt_data; 270789Sahrens *size = zt->zt_size; 271789Sahrens *bufsize = zt->zt_bufsize; 272789Sahrens 273789Sahrens zio->io_transform_stack = zt->zt_next; 274789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 275789Sahrens 276789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 277789Sahrens zio->io_data = zt->zt_data; 278789Sahrens zio->io_size = zt->zt_size; 279789Sahrens } 280789Sahrens } 281789Sahrens 282789Sahrens static void 283789Sahrens zio_clear_transform_stack(zio_t *zio) 284789Sahrens { 285789Sahrens void *data; 286789Sahrens uint64_t size, bufsize; 287789Sahrens 288789Sahrens ASSERT(zio->io_transform_stack != NULL); 289789Sahrens 290789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 291789Sahrens while (zio->io_transform_stack != NULL) { 292789Sahrens zio_buf_free(data, bufsize); 293789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 294789Sahrens } 295789Sahrens } 296789Sahrens 297789Sahrens /* 298789Sahrens * ========================================================================== 299789Sahrens * Create the various types of I/O (read, write, free) 300789Sahrens * ========================================================================== 301789Sahrens */ 302789Sahrens static zio_t * 303789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 304789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 305789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 306789Sahrens { 307789Sahrens zio_t *zio; 308789Sahrens 309789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 310789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 311789Sahrens 3124055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 3134055Seschrock bzero(zio, sizeof 
(zio_t)); 314789Sahrens zio->io_parent = pio; 315789Sahrens zio->io_spa = spa; 316789Sahrens zio->io_txg = txg; 317*4634Sek110237 zio->io_flags = flags; 318789Sahrens if (bp != NULL) { 319789Sahrens zio->io_bp = bp; 320789Sahrens zio->io_bp_copy = *bp; 321789Sahrens zio->io_bp_orig = *bp; 322*4634Sek110237 if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata || 323*4634Sek110237 BP_GET_LEVEL(bp) != 0) 324*4634Sek110237 zio->io_flags |= ZIO_FLAG_METADATA; 325789Sahrens } 326789Sahrens zio->io_done = done; 327789Sahrens zio->io_private = private; 328789Sahrens zio->io_type = type; 329789Sahrens zio->io_priority = priority; 330789Sahrens zio->io_stage = stage; 331789Sahrens zio->io_pipeline = pipeline; 332789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 333789Sahrens zio->io_timestamp = lbolt64; 334*4634Sek110237 if (pio != NULL) 335*4634Sek110237 zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA); 3362856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 337789Sahrens zio_push_transform(zio, data, size, size); 338789Sahrens 3393463Sahrens /* 3403463Sahrens * Note on config lock: 3413463Sahrens * 3423463Sahrens * If CONFIG_HELD is set, then the caller already has the config 3433463Sahrens * lock, so we don't need it for this io. 3443463Sahrens * 3453463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 3463463Sahrens * config lock on behalf of this io, so it should be released 3473463Sahrens * in zio_done. 3483463Sahrens * 3493463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 3503463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 3513463Sahrens * The NULL top-level ios rarely have any children, so we delay 3523463Sahrens * grabbing the lock until the first child is added (but it is 3533463Sahrens * still grabbed on behalf of the top-level i/o, so additional 3543463Sahrens * children don't need to also grab it). This greatly reduces 3553463Sahrens * contention on the config lock. 
3563463Sahrens */ 357789Sahrens if (pio == NULL) { 3583463Sahrens if (type != ZIO_TYPE_NULL && 3593463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3601544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 3613463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3623463Sahrens } 363789Sahrens zio->io_root = zio; 364789Sahrens } else { 365789Sahrens zio->io_root = pio->io_root; 3661544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3671544Seschrock zio->io_logical = pio->io_logical; 368789Sahrens mutex_enter(&pio->io_lock); 3693463Sahrens if (pio->io_parent == NULL && 3703463Sahrens pio->io_type == ZIO_TYPE_NULL && 3713463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 3723463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 3733463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3743463Sahrens spa_config_enter(zio->io_spa, RW_READER, pio); 3753463Sahrens } 376789Sahrens if (stage < ZIO_STAGE_READY) 377789Sahrens pio->io_children_notready++; 378789Sahrens pio->io_children_notdone++; 379789Sahrens zio->io_sibling_next = pio->io_child; 380789Sahrens zio->io_sibling_prev = NULL; 381789Sahrens if (pio->io_child != NULL) 382789Sahrens pio->io_child->io_sibling_prev = zio; 383789Sahrens pio->io_child = zio; 3841775Sbillm zio->io_ndvas = pio->io_ndvas; 385789Sahrens mutex_exit(&pio->io_lock); 386789Sahrens } 387789Sahrens 388789Sahrens return (zio); 389789Sahrens } 390789Sahrens 391789Sahrens zio_t * 392789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 393789Sahrens int flags) 394789Sahrens { 395789Sahrens zio_t *zio; 396789Sahrens 397789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 398789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 399789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 400789Sahrens 401789Sahrens return (zio); 402789Sahrens } 403789Sahrens 404789Sahrens zio_t * 405789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 406789Sahrens { 407789Sahrens return 
(zio_null(NULL, spa, done, private, flags)); 408789Sahrens } 409789Sahrens 410789Sahrens zio_t * 411789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 412789Sahrens uint64_t size, zio_done_func_t *done, void *private, 4131544Seschrock int priority, int flags, zbookmark_t *zb) 414789Sahrens { 415789Sahrens zio_t *zio; 416789Sahrens 417789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 418789Sahrens 419789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 4202981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4212981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4221544Seschrock zio->io_bookmark = *zb; 4231544Seschrock 4241544Seschrock zio->io_logical = zio; 425789Sahrens 426789Sahrens /* 427789Sahrens * Work off our copy of the bp so the caller can free it. 428789Sahrens */ 429789Sahrens zio->io_bp = &zio->io_bp_copy; 430789Sahrens 431789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 432789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 433789Sahrens void *cbuf = zio_buf_alloc(csize); 434789Sahrens 435789Sahrens zio_push_transform(zio, cbuf, csize, csize); 436789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 437789Sahrens } 438789Sahrens 4391775Sbillm if (BP_IS_GANG(bp)) { 440789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 441789Sahrens void *gbuf = zio_buf_alloc(gsize); 442789Sahrens 443789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 444789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 445789Sahrens } 446789Sahrens 447789Sahrens return (zio); 448789Sahrens } 449789Sahrens 450789Sahrens zio_t * 4511775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 452789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4533547Smaybee zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 4543547Smaybee int flags, zbookmark_t *zb) 455789Sahrens { 456789Sahrens zio_t *zio; 457789Sahrens 458789Sahrens ASSERT(checksum >= 
ZIO_CHECKSUM_OFF && 459789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 460789Sahrens 461789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 462789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 463789Sahrens 464789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4652981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 466789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 467789Sahrens 4683547Smaybee zio->io_ready = ready; 4693547Smaybee 4701544Seschrock zio->io_bookmark = *zb; 4711544Seschrock 4721544Seschrock zio->io_logical = zio; 4731544Seschrock 474789Sahrens zio->io_checksum = checksum; 475789Sahrens zio->io_compress = compress; 4761775Sbillm zio->io_ndvas = ncopies; 477789Sahrens 478789Sahrens if (compress != ZIO_COMPRESS_OFF) 479789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 480789Sahrens 481789Sahrens if (bp->blk_birth != txg) { 482789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 483789Sahrens BP_ZERO(bp); 484789Sahrens BP_SET_LSIZE(bp, size); 485789Sahrens BP_SET_PSIZE(bp, size); 4861775Sbillm } else { 4871775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 4881775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 4891775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 490789Sahrens } 491789Sahrens 492789Sahrens return (zio); 493789Sahrens } 494789Sahrens 495789Sahrens zio_t * 496789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 497789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4981544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4991544Seschrock zbookmark_t *zb) 500789Sahrens { 501789Sahrens zio_t *zio; 502789Sahrens 503789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 5042981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 505789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 506789Sahrens 5071544Seschrock zio->io_bookmark = *zb; 508789Sahrens zio->io_checksum = checksum; 509789Sahrens 
zio->io_compress = ZIO_COMPRESS_OFF; 510789Sahrens 5111775Sbillm if (pio != NULL) 5121775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 5131775Sbillm 514789Sahrens return (zio); 515789Sahrens } 516789Sahrens 517789Sahrens static zio_t * 518789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 519789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 520789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 521789Sahrens { 522789Sahrens zio_t *zio; 523789Sahrens 524789Sahrens BP_ZERO(bp); 525789Sahrens BP_SET_LSIZE(bp, size); 526789Sahrens BP_SET_PSIZE(bp, size); 527789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 528789Sahrens 529789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 530789Sahrens ZIO_TYPE_WRITE, priority, flags, 531789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 532789Sahrens 533789Sahrens zio->io_checksum = checksum; 534789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 535789Sahrens 536789Sahrens return (zio); 537789Sahrens } 538789Sahrens 539789Sahrens zio_t * 540789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 541789Sahrens zio_done_func_t *done, void *private) 542789Sahrens { 543789Sahrens zio_t *zio; 544789Sahrens 545789Sahrens ASSERT(!BP_IS_HOLE(bp)); 546789Sahrens 547789Sahrens if (txg == spa->spa_syncing_txg && 548789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 549789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 550789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 551789Sahrens } 552789Sahrens 553789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 5542981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 555789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 556789Sahrens 557789Sahrens zio->io_bp = &zio->io_bp_copy; 558789Sahrens 559789Sahrens return (zio); 560789Sahrens } 561789Sahrens 562789Sahrens zio_t * 563789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 
564789Sahrens zio_done_func_t *done, void *private) 565789Sahrens { 566789Sahrens zio_t *zio; 567789Sahrens 568789Sahrens /* 569789Sahrens * A claim is an allocation of a specific block. Claims are needed 570789Sahrens * to support immediate writes in the intent log. The issue is that 571789Sahrens * immediate writes contain committed data, but in a txg that was 572789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 573789Sahrens * the intent log claims all blocks that contain immediate write data 574789Sahrens * so that the SPA knows they're in use. 575789Sahrens * 576789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 577789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 578789Sahrens */ 579789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 580789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 581789Sahrens 582789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 583789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 584789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 585789Sahrens 586789Sahrens zio->io_bp = &zio->io_bp_copy; 587789Sahrens 588789Sahrens return (zio); 589789Sahrens } 590789Sahrens 591789Sahrens zio_t * 592789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 593789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 594789Sahrens { 595789Sahrens zio_t *zio; 596789Sahrens int c; 597789Sahrens 598789Sahrens if (vd->vdev_children == 0) { 599789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 600789Sahrens ZIO_TYPE_IOCTL, priority, flags, 601789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 602789Sahrens 603789Sahrens zio->io_vd = vd; 604789Sahrens zio->io_cmd = cmd; 605789Sahrens } else { 606789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 607789Sahrens 608789Sahrens for (c = 0; c < vd->vdev_children; c++) 609789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 
610789Sahrens done, private, priority, flags)); 611789Sahrens } 612789Sahrens 613789Sahrens return (zio); 614789Sahrens } 615789Sahrens 616789Sahrens static void 617789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 618789Sahrens int checksum) 619789Sahrens { 620789Sahrens ASSERT(vd->vdev_children == 0); 621789Sahrens 622789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 623789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 624789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 625789Sahrens 626789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 627789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 628789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 629789Sahrens 630789Sahrens BP_ZERO(bp); 631789Sahrens 632789Sahrens BP_SET_LSIZE(bp, size); 633789Sahrens BP_SET_PSIZE(bp, size); 634789Sahrens 635789Sahrens BP_SET_CHECKSUM(bp, checksum); 636789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 637789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 638789Sahrens 639789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 640789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 641789Sahrens } 642789Sahrens 643789Sahrens zio_t * 644789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 645789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 646789Sahrens int priority, int flags) 647789Sahrens { 648789Sahrens zio_t *zio; 649789Sahrens blkptr_t blk; 650789Sahrens 651789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 652789Sahrens 653789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 654789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 655789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 656789Sahrens 657789Sahrens zio->io_vd = vd; 658789Sahrens zio->io_offset = offset; 659789Sahrens 660789Sahrens /* 661789Sahrens * Work off our copy of the bp so the caller can free it. 
662789Sahrens */ 663789Sahrens zio->io_bp = &zio->io_bp_copy; 664789Sahrens 665789Sahrens return (zio); 666789Sahrens } 667789Sahrens 668789Sahrens zio_t * 669789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 670789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 671789Sahrens int priority, int flags) 672789Sahrens { 673789Sahrens zio_block_tail_t *zbt; 674789Sahrens void *wbuf; 675789Sahrens zio_t *zio; 676789Sahrens blkptr_t blk; 677789Sahrens 678789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 679789Sahrens 680789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 681789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 682789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 683789Sahrens 684789Sahrens zio->io_vd = vd; 685789Sahrens zio->io_offset = offset; 686789Sahrens 687789Sahrens zio->io_bp = &zio->io_bp_copy; 688789Sahrens zio->io_checksum = checksum; 689789Sahrens 690789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 691789Sahrens /* 692789Sahrens * zbt checksums are necessarily destructive -- they modify 693789Sahrens * one word of the write buffer to hold the verifier/checksum. 694789Sahrens * Therefore, we must make a local copy in case the data is 695789Sahrens * being written to multiple places. 696789Sahrens */ 697789Sahrens wbuf = zio_buf_alloc(size); 698789Sahrens bcopy(data, wbuf, size); 699789Sahrens zio_push_transform(zio, wbuf, size, size); 700789Sahrens 701789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 702789Sahrens zbt->zbt_cksum = blk.blk_cksum; 703789Sahrens } 704789Sahrens 705789Sahrens return (zio); 706789Sahrens } 707789Sahrens 708789Sahrens /* 709789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 
710789Sahrens */ 711789Sahrens zio_t * 712789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 713789Sahrens void *data, uint64_t size, int type, int priority, int flags, 714789Sahrens zio_done_func_t *done, void *private) 715789Sahrens { 716789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 717789Sahrens zio_t *cio; 718789Sahrens 719789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 720789Sahrens /* 721789Sahrens * If we have the bp, then the child should perform the 722789Sahrens * checksum and the parent need not. This pushes error 723789Sahrens * detection as close to the leaves as possible and 724789Sahrens * eliminates redundant checksums in the interior nodes. 725789Sahrens */ 726789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 727789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 728789Sahrens } 729789Sahrens 730789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 731789Sahrens done, private, type, priority, 732789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 7331775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 734789Sahrens 735789Sahrens cio->io_vd = vd; 736789Sahrens cio->io_offset = offset; 737789Sahrens 738789Sahrens return (cio); 739789Sahrens } 740789Sahrens 741789Sahrens /* 742789Sahrens * ========================================================================== 743789Sahrens * Initiate I/O, either sync or async 744789Sahrens * ========================================================================== 745789Sahrens */ 746789Sahrens int 747789Sahrens zio_wait(zio_t *zio) 748789Sahrens { 749789Sahrens int error; 750789Sahrens 751789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 752789Sahrens 753789Sahrens zio->io_waiter = curthread; 754789Sahrens 755789Sahrens zio_next_stage_async(zio); 756789Sahrens 757789Sahrens mutex_enter(&zio->io_lock); 758789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 759789Sahrens cv_wait(&zio->io_cv, 
&zio->io_lock); 760789Sahrens mutex_exit(&zio->io_lock); 761789Sahrens 762789Sahrens error = zio->io_error; 7632856Snd150628 mutex_destroy(&zio->io_lock); 7644055Seschrock kmem_cache_free(zio_cache, zio); 765789Sahrens 766789Sahrens return (error); 767789Sahrens } 768789Sahrens 769789Sahrens void 770789Sahrens zio_nowait(zio_t *zio) 771789Sahrens { 772789Sahrens zio_next_stage_async(zio); 773789Sahrens } 774789Sahrens 775789Sahrens /* 776789Sahrens * ========================================================================== 777789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 778789Sahrens * ========================================================================== 779789Sahrens */ 780789Sahrens static void 781789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 782789Sahrens { 783789Sahrens mutex_enter(&zio->io_lock); 784789Sahrens if (*countp == 0) { 785789Sahrens ASSERT(zio->io_stalled == 0); 786789Sahrens mutex_exit(&zio->io_lock); 787789Sahrens zio_next_stage(zio); 788789Sahrens } else { 789789Sahrens zio->io_stalled = stage; 790789Sahrens mutex_exit(&zio->io_lock); 791789Sahrens } 792789Sahrens } 793789Sahrens 794789Sahrens static void 795789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 796789Sahrens { 797789Sahrens zio_t *pio = zio->io_parent; 798789Sahrens 799789Sahrens mutex_enter(&pio->io_lock); 800789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 801789Sahrens pio->io_error = zio->io_error; 802789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 803789Sahrens pio->io_stalled = 0; 804789Sahrens mutex_exit(&pio->io_lock); 805789Sahrens zio_next_stage_async(pio); 806789Sahrens } else { 807789Sahrens mutex_exit(&pio->io_lock); 808789Sahrens } 809789Sahrens } 810789Sahrens 811789Sahrens static void 812789Sahrens zio_wait_children_ready(zio_t *zio) 813789Sahrens { 814789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 
815789Sahrens &zio->io_children_notready); 816789Sahrens } 817789Sahrens 818789Sahrens void 819789Sahrens zio_wait_children_done(zio_t *zio) 820789Sahrens { 821789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 822789Sahrens &zio->io_children_notdone); 823789Sahrens } 824789Sahrens 825789Sahrens static void 826789Sahrens zio_ready(zio_t *zio) 827789Sahrens { 828789Sahrens zio_t *pio = zio->io_parent; 829789Sahrens 8303547Smaybee if (zio->io_ready) 8313547Smaybee zio->io_ready(zio); 8323547Smaybee 833789Sahrens if (pio != NULL) 834789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 835789Sahrens &pio->io_children_notready); 836789Sahrens 837789Sahrens if (zio->io_bp) 838789Sahrens zio->io_bp_copy = *zio->io_bp; 839789Sahrens 840789Sahrens zio_next_stage(zio); 841789Sahrens } 842789Sahrens 843789Sahrens static void 844789Sahrens zio_done(zio_t *zio) 845789Sahrens { 846789Sahrens zio_t *pio = zio->io_parent; 847789Sahrens spa_t *spa = zio->io_spa; 848789Sahrens blkptr_t *bp = zio->io_bp; 849789Sahrens vdev_t *vd = zio->io_vd; 850789Sahrens 851789Sahrens ASSERT(zio->io_children_notready == 0); 852789Sahrens ASSERT(zio->io_children_notdone == 0); 853789Sahrens 854789Sahrens if (bp != NULL) { 855789Sahrens ASSERT(bp->blk_pad[0] == 0); 856789Sahrens ASSERT(bp->blk_pad[1] == 0); 857789Sahrens ASSERT(bp->blk_pad[2] == 0); 858789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 859789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 8601775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 861789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 8621775Sbillm if (zio->io_ndvas != 0) 8631775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 8641775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 8651775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 8661775Sbillm } 867789Sahrens } 868789Sahrens 869789Sahrens if (vd != NULL) 870789Sahrens vdev_stat_update(zio); 871789Sahrens 872789Sahrens if (zio->io_error) { 8731544Seschrock /* 
8741544Seschrock * If this I/O is attached to a particular vdev, 8751544Seschrock * generate an error message describing the I/O failure 8761544Seschrock * at the block level. We ignore these errors if the 8771544Seschrock * device is currently unavailable. 8781544Seschrock */ 8791732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 8801544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 8811732Sbonwick zio->io_spa, vd, zio, 0, 0); 882789Sahrens 8831544Seschrock if ((zio->io_error == EIO || 8841544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 8851544Seschrock zio->io_logical == zio) { 8861544Seschrock /* 8871544Seschrock * For root I/O requests, tell the SPA to log the error 8881544Seschrock * appropriately. Also, generate a logical data 8891544Seschrock * ereport. 8901544Seschrock */ 8911544Seschrock spa_log_error(zio->io_spa, zio); 8921544Seschrock 8931544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 8941544Seschrock zio->io_spa, NULL, zio, 0, 0); 8951544Seschrock } 896789Sahrens 8971544Seschrock /* 8981544Seschrock * For I/O requests that cannot fail, panic appropriately. 8991544Seschrock */ 9001544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 9013459Sek110237 char *blkbuf; 9023459Sek110237 9033459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 9043459Sek110237 if (blkbuf) { 9053459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 9063459Sek110237 bp ? bp : &zio->io_bp_copy); 9073459Sek110237 } 9081544Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 9091544Seschrock "%d", zio->io_error == ECKSUM ? 9101544Seschrock "bad checksum" : "I/O failure", 9111544Seschrock zio_type_name[zio->io_type], 9121544Seschrock vdev_description(vd), 9131544Seschrock (u_longlong_t)zio->io_offset, 9143459Sek110237 zio, blkbuf ? 
blkbuf : "", zio->io_error); 9151544Seschrock } 916789Sahrens } 917789Sahrens zio_clear_transform_stack(zio); 918789Sahrens 919789Sahrens if (zio->io_done) 920789Sahrens zio->io_done(zio); 921789Sahrens 922789Sahrens ASSERT(zio->io_delegate_list == NULL); 923789Sahrens ASSERT(zio->io_delegate_next == NULL); 924789Sahrens 925789Sahrens if (pio != NULL) { 926789Sahrens zio_t *next, *prev; 927789Sahrens 928789Sahrens mutex_enter(&pio->io_lock); 929789Sahrens next = zio->io_sibling_next; 930789Sahrens prev = zio->io_sibling_prev; 931789Sahrens if (next != NULL) 932789Sahrens next->io_sibling_prev = prev; 933789Sahrens if (prev != NULL) 934789Sahrens prev->io_sibling_next = next; 935789Sahrens if (pio->io_child == zio) 936789Sahrens pio->io_child = next; 937789Sahrens mutex_exit(&pio->io_lock); 938789Sahrens 939789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 940789Sahrens &pio->io_children_notdone); 941789Sahrens } 942789Sahrens 9433463Sahrens /* 9444055Seschrock * Note: this I/O is now done, and will shortly be freed, so there is no 9454055Seschrock * need to clear this (or any other) flag. 
9463463Sahrens */ 9473463Sahrens if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 9481544Seschrock spa_config_exit(spa, zio); 949789Sahrens 950789Sahrens if (zio->io_waiter != NULL) { 951789Sahrens mutex_enter(&zio->io_lock); 952789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 953789Sahrens zio->io_stalled = zio->io_stage; 954789Sahrens cv_broadcast(&zio->io_cv); 955789Sahrens mutex_exit(&zio->io_lock); 956789Sahrens } else { 9574055Seschrock kmem_cache_free(zio_cache, zio); 958789Sahrens } 959789Sahrens } 960789Sahrens 961789Sahrens /* 962789Sahrens * ========================================================================== 963789Sahrens * Compression support 964789Sahrens * ========================================================================== 965789Sahrens */ 966789Sahrens static void 967789Sahrens zio_write_compress(zio_t *zio) 968789Sahrens { 969789Sahrens int compress = zio->io_compress; 970789Sahrens blkptr_t *bp = zio->io_bp; 971789Sahrens void *cbuf; 972789Sahrens uint64_t lsize = zio->io_size; 973789Sahrens uint64_t csize = lsize; 974789Sahrens uint64_t cbufsize = 0; 975789Sahrens int pass; 976789Sahrens 977789Sahrens if (bp->blk_birth == zio->io_txg) { 978789Sahrens /* 979789Sahrens * We're rewriting an existing block, which means we're 980789Sahrens * working on behalf of spa_sync(). For spa_sync() to 981789Sahrens * converge, it must eventually be the case that we don't 982789Sahrens * have to allocate new blocks. But compression changes 983789Sahrens * the blocksize, which forces a reallocate, and makes 984789Sahrens * convergence take longer. Therefore, after the first 985789Sahrens * few passes, stop compressing to ensure convergence. 
986789Sahrens */ 987789Sahrens pass = spa_sync_pass(zio->io_spa); 988789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 989789Sahrens compress = ZIO_COMPRESS_OFF; 990789Sahrens } else { 991789Sahrens ASSERT(BP_IS_HOLE(bp)); 992789Sahrens pass = 1; 993789Sahrens } 994789Sahrens 995789Sahrens if (compress != ZIO_COMPRESS_OFF) 996789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 997789Sahrens &cbuf, &csize, &cbufsize)) 998789Sahrens compress = ZIO_COMPRESS_OFF; 999789Sahrens 1000789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 1001789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 1002789Sahrens 1003789Sahrens /* 1004789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 1005789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 1006789Sahrens * but newly allocated blocks are sequential, so they can be written 1007789Sahrens * to disk faster. Therefore, we allow the first few passes of 1008789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 1009789Sahrens * There should only be a handful of blocks after pass 1 in any case. 
1010789Sahrens */ 1011789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 1012789Sahrens pass > zio_sync_pass.zp_rewrite) { 1013789Sahrens ASSERT(csize != 0); 10142885Sahrens BP_SET_LSIZE(bp, lsize); 10152885Sahrens BP_SET_COMPRESS(bp, compress); 1016789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 1017789Sahrens } else { 10183882Sahrens if (bp->blk_birth == zio->io_txg) 10193882Sahrens BP_ZERO(bp); 1020789Sahrens if (csize == 0) { 1021789Sahrens BP_ZERO(bp); 1022789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1023789Sahrens } else { 10241775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1025789Sahrens BP_SET_LSIZE(bp, lsize); 1026789Sahrens BP_SET_PSIZE(bp, csize); 1027789Sahrens BP_SET_COMPRESS(bp, compress); 1028789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 1029789Sahrens } 1030789Sahrens } 1031789Sahrens 1032789Sahrens zio_next_stage(zio); 1033789Sahrens } 1034789Sahrens 1035789Sahrens static void 1036789Sahrens zio_read_decompress(zio_t *zio) 1037789Sahrens { 1038789Sahrens blkptr_t *bp = zio->io_bp; 1039789Sahrens void *data; 1040789Sahrens uint64_t size; 1041789Sahrens uint64_t bufsize; 1042789Sahrens int compress = BP_GET_COMPRESS(bp); 1043789Sahrens 1044789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 1045789Sahrens 1046789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 1047789Sahrens 1048789Sahrens if (zio_decompress_data(compress, data, size, 1049789Sahrens zio->io_data, zio->io_size)) 1050789Sahrens zio->io_error = EIO; 1051789Sahrens 1052789Sahrens zio_buf_free(data, bufsize); 1053789Sahrens 1054789Sahrens zio_next_stage(zio); 1055789Sahrens } 1056789Sahrens 1057789Sahrens /* 1058789Sahrens * ========================================================================== 1059789Sahrens * Gang block support 1060789Sahrens * ========================================================================== 1061789Sahrens */ 1062789Sahrens static void 1063789Sahrens zio_gang_pipeline(zio_t *zio) 1064789Sahrens { 
1065789Sahrens /* 1066789Sahrens * By default, the pipeline assumes that we're dealing with a gang 1067789Sahrens * block. If we're not, strip out any gang-specific stages. 1068789Sahrens */ 10691775Sbillm if (!BP_IS_GANG(zio->io_bp)) 1070789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 1071789Sahrens 1072789Sahrens zio_next_stage(zio); 1073789Sahrens } 1074789Sahrens 1075789Sahrens static void 1076789Sahrens zio_gang_byteswap(zio_t *zio) 1077789Sahrens { 1078789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1079789Sahrens 1080789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1081789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1082789Sahrens } 1083789Sahrens 1084789Sahrens static void 1085789Sahrens zio_get_gang_header(zio_t *zio) 1086789Sahrens { 1087789Sahrens blkptr_t *bp = zio->io_bp; 1088789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1089789Sahrens void *gbuf = zio_buf_alloc(gsize); 1090789Sahrens 10911775Sbillm ASSERT(BP_IS_GANG(bp)); 1092789Sahrens 1093789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1094789Sahrens 1095789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1096789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1097789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1098789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 1099789Sahrens 1100789Sahrens zio_wait_children_done(zio); 1101789Sahrens } 1102789Sahrens 1103789Sahrens static void 1104789Sahrens zio_read_gang_members(zio_t *zio) 1105789Sahrens { 1106789Sahrens zio_gbh_phys_t *gbh; 1107789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1108789Sahrens int i; 1109789Sahrens 11101775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1111789Sahrens 1112789Sahrens zio_gang_byteswap(zio); 1113789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1114789Sahrens 1115789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1116789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1117789Sahrens lsize = BP_GET_PSIZE(gbp); 
1118789Sahrens 1119789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1120789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1121789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1122789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1123789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1124789Sahrens 1125789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1126789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 11271544Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 11281544Seschrock &zio->io_bookmark)); 1129789Sahrens } 1130789Sahrens 1131789Sahrens zio_buf_free(gbh, gbufsize); 1132789Sahrens zio_wait_children_done(zio); 1133789Sahrens } 1134789Sahrens 1135789Sahrens static void 1136789Sahrens zio_rewrite_gang_members(zio_t *zio) 1137789Sahrens { 1138789Sahrens zio_gbh_phys_t *gbh; 1139789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1140789Sahrens int i; 1141789Sahrens 11421775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1143789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1144789Sahrens 1145789Sahrens zio_gang_byteswap(zio); 1146789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1147789Sahrens 1148789Sahrens ASSERT(gsize == gbufsize); 1149789Sahrens 1150789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1151789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1152789Sahrens lsize = BP_GET_PSIZE(gbp); 1153789Sahrens 1154789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1155789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1156789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1157789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1158789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1159789Sahrens 1160789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1161789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 11621544Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 11631544Seschrock &zio->io_bookmark)); 1164789Sahrens } 1165789Sahrens 1166789Sahrens zio_push_transform(zio, gbh, 
gsize, gbufsize); 1167789Sahrens zio_wait_children_ready(zio); 1168789Sahrens } 1169789Sahrens 1170789Sahrens static void 1171789Sahrens zio_free_gang_members(zio_t *zio) 1172789Sahrens { 1173789Sahrens zio_gbh_phys_t *gbh; 1174789Sahrens uint64_t gsize, gbufsize; 1175789Sahrens int i; 1176789Sahrens 11771775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1178789Sahrens 1179789Sahrens zio_gang_byteswap(zio); 1180789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1181789Sahrens 1182789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1183789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1184789Sahrens 1185789Sahrens if (BP_IS_HOLE(gbp)) 1186789Sahrens continue; 1187789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1188789Sahrens gbp, NULL, NULL)); 1189789Sahrens } 1190789Sahrens 1191789Sahrens zio_buf_free(gbh, gbufsize); 1192789Sahrens zio_next_stage(zio); 1193789Sahrens } 1194789Sahrens 1195789Sahrens static void 1196789Sahrens zio_claim_gang_members(zio_t *zio) 1197789Sahrens { 1198789Sahrens zio_gbh_phys_t *gbh; 1199789Sahrens uint64_t gsize, gbufsize; 1200789Sahrens int i; 1201789Sahrens 12021775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1203789Sahrens 1204789Sahrens zio_gang_byteswap(zio); 1205789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1206789Sahrens 1207789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1208789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1209789Sahrens if (BP_IS_HOLE(gbp)) 1210789Sahrens continue; 1211789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1212789Sahrens gbp, NULL, NULL)); 1213789Sahrens } 1214789Sahrens 1215789Sahrens zio_buf_free(gbh, gbufsize); 1216789Sahrens zio_next_stage(zio); 1217789Sahrens } 1218789Sahrens 1219789Sahrens static void 1220789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1221789Sahrens { 1222789Sahrens zio_t *pio = zio->io_parent; 12231775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 12241775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1225789Sahrens 
uint64_t asize; 12261775Sbillm int d; 1227789Sahrens 12281775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 12291775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 12301775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 12311775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 12321775Sbillm 1233789Sahrens mutex_enter(&pio->io_lock); 12341775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 12351775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 12361775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 12371775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 12381775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 12391775Sbillm } 1240789Sahrens mutex_exit(&pio->io_lock); 1241789Sahrens } 1242789Sahrens 1243789Sahrens static void 12444527Sperrin zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) 1245789Sahrens { 1246789Sahrens blkptr_t *bp = zio->io_bp; 12471775Sbillm dva_t *dva = bp->blk_dva; 12481775Sbillm spa_t *spa = zio->io_spa; 1249789Sahrens zio_gbh_phys_t *gbh; 12501775Sbillm uint64_t txg = zio->io_txg; 1251789Sahrens uint64_t resid = zio->io_size; 1252789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1253789Sahrens uint64_t gsize, loff, lsize; 1254789Sahrens uint32_t gbps_left; 12551775Sbillm int ndvas = zio->io_ndvas; 12561775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1257789Sahrens int error; 12581775Sbillm int i, d; 1259789Sahrens 1260789Sahrens gsize = SPA_GANGBLOCKSIZE; 1261789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1262789Sahrens 12634527Sperrin error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, 12644527Sperrin B_FALSE); 1265789Sahrens if (error == ENOSPC) 1266789Sahrens panic("can't allocate gang block header"); 1267789Sahrens ASSERT(error == 0); 1268789Sahrens 12691775Sbillm for (d = 0; d < gbh_ndvas; d++) 12701775Sbillm DVA_SET_GANG(&dva[d], 1); 1271789Sahrens 12721775Sbillm bp->blk_birth = txg; 1273789Sahrens 1274789Sahrens gbh = 
zio_buf_alloc(gsize); 1275789Sahrens bzero(gbh, gsize); 1276789Sahrens 12771775Sbillm /* We need to test multi-level gang blocks */ 12781775Sbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 12791775Sbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 12801775Sbillm 1281789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1282789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1283789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 12841775Sbillm dva = gbp->blk_dva; 1285789Sahrens 1286789Sahrens ASSERT(gbps_left != 0); 1287789Sahrens maxalloc = MIN(maxalloc, resid); 1288789Sahrens 1289789Sahrens while (resid <= maxalloc * gbps_left) { 12904527Sperrin error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas, 12913063Sperrin txg, bp, B_FALSE); 1292789Sahrens if (error == 0) 1293789Sahrens break; 1294789Sahrens ASSERT3U(error, ==, ENOSPC); 1295789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1296789Sahrens panic("really out of space"); 1297789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1298789Sahrens } 1299789Sahrens 1300789Sahrens if (resid <= maxalloc * gbps_left) { 1301789Sahrens lsize = maxalloc; 1302789Sahrens BP_SET_LSIZE(gbp, lsize); 1303789Sahrens BP_SET_PSIZE(gbp, lsize); 1304789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 13051775Sbillm gbp->blk_birth = txg; 13061775Sbillm zio_nowait(zio_rewrite(zio, spa, 13071775Sbillm zio->io_checksum, txg, gbp, 1308789Sahrens (char *)zio->io_data + loff, lsize, 1309789Sahrens zio_write_allocate_gang_member_done, NULL, 13101544Seschrock zio->io_priority, zio->io_flags, 13111544Seschrock &zio->io_bookmark)); 1312789Sahrens } else { 1313789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1314789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 13151775Sbillm zio_nowait(zio_write_allocate(zio, spa, 13161775Sbillm zio->io_checksum, txg, gbp, 1317789Sahrens (char *)zio->io_data + loff, lsize, 1318789Sahrens zio_write_allocate_gang_member_done, NULL, 1319789Sahrens zio->io_priority, 
zio->io_flags)); 1320789Sahrens } 1321789Sahrens } 1322789Sahrens 1323789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1324789Sahrens 1325789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1326789Sahrens 1327789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 13281775Sbillm /* 13291775Sbillm * As much as we'd like this to be zio_wait_children_ready(), 13301775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 13311775Sbillm * so we have to wait for that to finish in order for our BP 13321775Sbillm * to be stable. 13331775Sbillm */ 1334789Sahrens zio_wait_children_done(zio); 1335789Sahrens } 1336789Sahrens 1337789Sahrens /* 1338789Sahrens * ========================================================================== 1339789Sahrens * Allocate and free blocks 1340789Sahrens * ========================================================================== 1341789Sahrens */ 1342789Sahrens static void 1343789Sahrens zio_dva_allocate(zio_t *zio) 1344789Sahrens { 13454527Sperrin spa_t *spa = zio->io_spa; 13464527Sperrin metaslab_class_t *mc = spa->spa_normal_class; 1347789Sahrens blkptr_t *bp = zio->io_bp; 1348789Sahrens int error; 1349789Sahrens 1350789Sahrens ASSERT(BP_IS_HOLE(bp)); 13511775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 13521775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 13534527Sperrin ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); 1354789Sahrens 1355789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1356789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 13574527Sperrin zio_write_allocate_gang_members(zio, mc); 1358789Sahrens return; 1359789Sahrens } 1360789Sahrens 1361789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1362789Sahrens 13634527Sperrin error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, 13643063Sperrin zio->io_txg, NULL, B_FALSE); 1365789Sahrens 1366789Sahrens if (error == 0) { 1367789Sahrens bp->blk_birth = zio->io_txg; 
1368789Sahrens } else if (error == ENOSPC) { 1369789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1370789Sahrens panic("really, truly out of space"); 13714527Sperrin zio_write_allocate_gang_members(zio, mc); 1372789Sahrens return; 1373789Sahrens } else { 1374789Sahrens zio->io_error = error; 1375789Sahrens } 1376789Sahrens zio_next_stage(zio); 1377789Sahrens } 1378789Sahrens 1379789Sahrens static void 1380789Sahrens zio_dva_free(zio_t *zio) 1381789Sahrens { 1382789Sahrens blkptr_t *bp = zio->io_bp; 1383789Sahrens 13841807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1385789Sahrens 1386789Sahrens BP_ZERO(bp); 1387789Sahrens 1388789Sahrens zio_next_stage(zio); 1389789Sahrens } 1390789Sahrens 1391789Sahrens static void 1392789Sahrens zio_dva_claim(zio_t *zio) 1393789Sahrens { 13941807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1395789Sahrens 1396789Sahrens zio_next_stage(zio); 1397789Sahrens } 1398789Sahrens 1399789Sahrens /* 1400789Sahrens * ========================================================================== 1401789Sahrens * Read and write to physical devices 1402789Sahrens * ========================================================================== 1403789Sahrens */ 1404789Sahrens 1405789Sahrens static void 14061775Sbillm zio_vdev_io_start(zio_t *zio) 1407789Sahrens { 1408789Sahrens vdev_t *vd = zio->io_vd; 14091775Sbillm vdev_t *tvd = vd ? 
vd->vdev_top : NULL; 14101775Sbillm blkptr_t *bp = zio->io_bp; 14111775Sbillm uint64_t align; 1412789Sahrens 14131775Sbillm if (vd == NULL) { 14141775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14151775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 14161775Sbillm return; 14171775Sbillm } 14181775Sbillm 14191775Sbillm align = 1ULL << tvd->vdev_ashift; 14201775Sbillm 14211732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1422789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1423789Sahrens 14241775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 14251775Sbillm vd->vdev_children == 0) { 1426789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1427789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1428789Sahrens } 1429789Sahrens 14301732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 14311732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 14321732Sbonwick char *abuf = zio_buf_alloc(asize); 14331732Sbonwick ASSERT(vd == tvd); 14341732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 14351732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 14361732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 14371732Sbonwick } 14381732Sbonwick zio_push_transform(zio, abuf, asize, asize); 14391732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 14401732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 14411732Sbonwick } 14421732Sbonwick 14431732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 14441732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 14451732Sbonwick ASSERT(bp == NULL || 14461732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1447789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1448789Sahrens 1449789Sahrens vdev_io_start(zio); 1450789Sahrens 1451789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1452789Sahrens } 1453789Sahrens 1454789Sahrens static void 1455789Sahrens zio_vdev_io_done(zio_t *zio) 1456789Sahrens { 14571775Sbillm if (zio->io_vd == NULL) 
14581775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14591775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 14601775Sbillm else 14611775Sbillm vdev_io_done(zio); 1462789Sahrens } 1463789Sahrens 1464789Sahrens /* XXPOLICY */ 14651544Seschrock boolean_t 1466789Sahrens zio_should_retry(zio_t *zio) 1467789Sahrens { 1468789Sahrens vdev_t *vd = zio->io_vd; 1469789Sahrens 1470789Sahrens if (zio->io_error == 0) 1471789Sahrens return (B_FALSE); 1472789Sahrens if (zio->io_delegate_list != NULL) 1473789Sahrens return (B_FALSE); 14741775Sbillm if (vd && vd != vd->vdev_top) 1475789Sahrens return (B_FALSE); 1476789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1477789Sahrens return (B_FALSE); 14781544Seschrock if (zio->io_retries > 0) 1479789Sahrens return (B_FALSE); 1480789Sahrens 1481789Sahrens return (B_TRUE); 1482789Sahrens } 1483789Sahrens 1484789Sahrens static void 1485789Sahrens zio_vdev_io_assess(zio_t *zio) 1486789Sahrens { 1487789Sahrens vdev_t *vd = zio->io_vd; 14881775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1489789Sahrens 14901544Seschrock ASSERT(zio->io_vsd == NULL); 1491789Sahrens 14921732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 14931732Sbonwick void *abuf; 14941732Sbonwick uint64_t asize; 14951732Sbonwick ASSERT(vd == tvd); 14961732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 14971732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 14981732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 14991732Sbonwick zio_buf_free(abuf, asize); 15001732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 15011732Sbonwick } 15021732Sbonwick 15031544Seschrock if (zio_injection_enabled && !zio->io_error) 15041544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1505789Sahrens 1506789Sahrens /* 1507789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 
1508789Sahrens */ 1509789Sahrens /* XXPOLICY */ 1510789Sahrens if (zio_should_retry(zio)) { 1511789Sahrens ASSERT(tvd == vd); 1512789Sahrens 1513789Sahrens zio->io_retries++; 1514789Sahrens zio->io_error = 0; 15153463Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | 15163463Sahrens ZIO_FLAG_CONFIG_GRABBED; 1517789Sahrens /* XXPOLICY */ 1518789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1519789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 15201775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1521789Sahrens 1522789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1523789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1524789Sahrens vdev_description(vd), zio->io_offset); 1525789Sahrens 15261544Seschrock zio_next_stage_async(zio); 15271544Seschrock return; 15281544Seschrock } 1529789Sahrens 1530789Sahrens zio_next_stage(zio); 1531789Sahrens } 1532789Sahrens 1533789Sahrens void 1534789Sahrens zio_vdev_io_reissue(zio_t *zio) 1535789Sahrens { 1536789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1537789Sahrens ASSERT(zio->io_error == 0); 1538789Sahrens 1539789Sahrens zio->io_stage--; 1540789Sahrens } 1541789Sahrens 1542789Sahrens void 1543789Sahrens zio_vdev_io_redone(zio_t *zio) 1544789Sahrens { 1545789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1546789Sahrens 1547789Sahrens zio->io_stage--; 1548789Sahrens } 1549789Sahrens 1550789Sahrens void 1551789Sahrens zio_vdev_io_bypass(zio_t *zio) 1552789Sahrens { 1553789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1554789Sahrens ASSERT(zio->io_error == 0); 1555789Sahrens 1556789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1557789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1558789Sahrens } 1559789Sahrens 1560789Sahrens /* 1561789Sahrens * ========================================================================== 1562789Sahrens * Generate and verify checksums 1563789Sahrens * ========================================================================== 1564789Sahrens */ 
1565789Sahrens static void 1566789Sahrens zio_checksum_generate(zio_t *zio) 1567789Sahrens { 1568789Sahrens int checksum = zio->io_checksum; 1569789Sahrens blkptr_t *bp = zio->io_bp; 1570789Sahrens 1571789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1572789Sahrens 1573789Sahrens BP_SET_CHECKSUM(bp, checksum); 1574789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1575789Sahrens 1576789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1577789Sahrens 1578789Sahrens zio_next_stage(zio); 1579789Sahrens } 1580789Sahrens 1581789Sahrens static void 1582789Sahrens zio_gang_checksum_generate(zio_t *zio) 1583789Sahrens { 1584789Sahrens zio_cksum_t zc; 1585789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1586789Sahrens 15871775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1588789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1589789Sahrens 1590789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1591789Sahrens 1592789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1593789Sahrens 1594789Sahrens zio_next_stage(zio); 1595789Sahrens } 1596789Sahrens 1597789Sahrens static void 1598789Sahrens zio_checksum_verify(zio_t *zio) 1599789Sahrens { 1600789Sahrens if (zio->io_bp != NULL) { 1601789Sahrens zio->io_error = zio_checksum_error(zio); 16021544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 16031544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 16041544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1605789Sahrens } 1606789Sahrens 1607789Sahrens zio_next_stage(zio); 1608789Sahrens } 1609789Sahrens 1610789Sahrens /* 1611789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 
1612789Sahrens  */
1613789Sahrens void
1614789Sahrens zio_checksum_verified(zio_t *zio)
1615789Sahrens {
1616789Sahrens 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); /* drop the CHECKSUM_VERIFY stage from the remaining pipeline */
1617789Sahrens }
1618789Sahrens 
1619789Sahrens /*
1620789Sahrens  * Set the external verifier for a gang block based on stuff in the bp:
 * the vdev and offset of the bp's first DVA, plus the birth txg.
1621789Sahrens  */
1622789Sahrens void
1623789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
1624789Sahrens {
16251775Sbillm 	blkptr_t *bp = zio->io_bp;
16261775Sbillm 
16271775Sbillm 	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));	/* vdev of the bp's identity DVA */
16281775Sbillm 	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));	/* offset of the identity DVA */
16291775Sbillm 	zcp->zc_word[2] = bp->blk_birth;			/* txg in which the block was born */
1630789Sahrens 	zcp->zc_word[3] = 0;					/* last word is always zero */
1631789Sahrens }
1632789Sahrens 
1633789Sahrens /*
1634789Sahrens  * ==========================================================================
1635789Sahrens  * Define the pipeline
1636789Sahrens  * ==========================================================================
1637789Sahrens  */
1638789Sahrens typedef void zio_pipe_stage_t(zio_t *zio);
1639789Sahrens 
/*
 * Sentinel stage handler: the pipeline should never dispatch to the
 * first or last slot of zio_pipeline[]; landing here is a fatal bug.
 */
1640789Sahrens static void
1641789Sahrens zio_badop(zio_t *zio)
1642789Sahrens {
1643789Sahrens 	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
1644789Sahrens }
1645789Sahrens 
/*
 * Stage handlers, indexed directly by zio->io_stage (see zio_next_stage()),
 * so the entries must remain in ZIO_STAGE_* enumeration order.  The
 * zio_badop entries bracket the table to catch out-of-range stages.
 */
1646789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
1647789Sahrens 	zio_badop,
1648789Sahrens 	zio_wait_children_ready,
1649789Sahrens 	zio_write_compress,
1650789Sahrens 	zio_checksum_generate,
1651789Sahrens 	zio_gang_pipeline,
1652789Sahrens 	zio_get_gang_header,
1653789Sahrens 	zio_rewrite_gang_members,
1654789Sahrens 	zio_free_gang_members,
1655789Sahrens 	zio_claim_gang_members,
1656789Sahrens 	zio_dva_allocate,
1657789Sahrens 	zio_dva_free,
1658789Sahrens 	zio_dva_claim,
1659789Sahrens 	zio_gang_checksum_generate,
1660789Sahrens 	zio_ready,
1661789Sahrens 	zio_vdev_io_start,
1662789Sahrens 	zio_vdev_io_done,
1663789Sahrens 	zio_vdev_io_assess,
1664789Sahrens 	zio_wait_children_done,
1665789Sahrens 	zio_checksum_verify,
1666789Sahrens 	zio_read_gang_members,
1667789Sahrens 	zio_read_decompress,
1668789Sahrens 	zio_done,
1669789Sahrens 	zio_badop
1670789Sahrens };
1671789Sahrens 
1672789Sahrens /*
1673789Sahrens  * Move an I/O to the next stage of the pipeline and execute that stage.
1674789Sahrens  * There's no locking on io_stage because there's no legitimate way for
1675789Sahrens  * multiple threads to be attempting to process the same I/O.
1676789Sahrens  */
1677789Sahrens void
1678789Sahrens zio_next_stage(zio_t *zio)
1679789Sahrens {
1680789Sahrens 	uint32_t pipeline = zio->io_pipeline;
1681789Sahrens 
1682789Sahrens 	ASSERT(!MUTEX_HELD(&zio->io_lock));
1683789Sahrens 
	/*
	 * On error, collapse the pipeline to just the error stages, unless
	 * the failing stage is part of the vdev I/O pipeline.
	 */
1684789Sahrens 	if (zio->io_error) {
1685789Sahrens 		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
1686789Sahrens 		    zio, vdev_description(zio->io_vd),
1687789Sahrens 		    zio->io_offset, zio->io_stage, zio->io_error);
1688789Sahrens 		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
1689789Sahrens 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
1690789Sahrens 	}
1691789Sahrens 
	/* Advance io_stage to the next stage present in the pipeline mask. */
1692789Sahrens 	while (((1U << ++zio->io_stage) & pipeline) == 0)
1693789Sahrens 		continue;
1694789Sahrens 
1695789Sahrens 	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
1696789Sahrens 	ASSERT(zio->io_stalled == 0);
1697789Sahrens 
16983689Sek110237 	/*
16993689Sek110237 	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
	 *
	 * Here only the WRITE_COMPRESS stage of a non-metadata I/O, when
	 * flagged async for this zio, is handed off to the per-type issue
	 * taskq; every other stage runs synchronously in this thread.
17003689Sek110237 	 */
17013689Sek110237 	if (((1U << zio->io_stage) & zio->io_async_stages) &&
17023689Sek110237 	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
17033689Sek110237 	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
17043689Sek110237 		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
17053689Sek110237 		(void) taskq_dispatch(tq,
17063689Sek110237 		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
17073689Sek110237 	} else {
17083689Sek110237 		zio_pipeline[zio->io_stage](zio);
17093689Sek110237 	}
1710789Sahrens }
1711789Sahrens 
/*
 * Like zio_next_stage(), but any stage in io_async_stages is dispatched to
 * a taskq (issue or intr, depending on the stage) instead of running in the
 * caller's context.  See the block comment below for why two sets of taskqs
 * are needed.
 */
1712789Sahrens void
1713789Sahrens zio_next_stage_async(zio_t *zio)
1714789Sahrens {
1715789Sahrens 	taskq_t *tq;
1716789Sahrens 	uint32_t pipeline = zio->io_pipeline;
1717789Sahrens 
1718789Sahrens 	ASSERT(!MUTEX_HELD(&zio->io_lock));
1719789Sahrens 
	/* Same error-pipeline truncation as in zio_next_stage(). */
1720789Sahrens 	if (zio->io_error) {
1721789Sahrens 		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
1722789Sahrens 		    zio, vdev_description(zio->io_vd),
1723789Sahrens 		    zio->io_offset, zio->io_stage, zio->io_error);
1724789Sahrens 		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
1725789Sahrens 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
1726789Sahrens 	}
1727789Sahrens 
1728789Sahrens 	while (((1U << ++zio->io_stage) & pipeline) == 0)
1729789Sahrens 		continue;
1730789Sahrens 
1731789Sahrens 	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
1732789Sahrens 	ASSERT(zio->io_stalled == 0);
1733789Sahrens 
1734789Sahrens 	/*
1735789Sahrens 	 * For performance, we'll probably want two sets of task queues:
1736789Sahrens 	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
1737789Sahrens 	 * part is for read performance: since we have to make a pass over
1738789Sahrens 	 * the data to checksum it anyway, we want to do this on the same CPU
1739789Sahrens 	 * that issued the read, because (assuming CPU scheduling affinity)
1740789Sahrens 	 * that thread is probably still there.  Getting this optimization
1741789Sahrens 	 * right avoids performance-hostile cache-to-cache transfers.
1742789Sahrens 	 *
1743789Sahrens 	 * Note that having two sets of task queues is also necessary for
1744789Sahrens 	 * correctness: if all of the issue threads get bogged down waiting
1745789Sahrens 	 * for dependent reads (e.g. metaslab freelist) to complete, then
1746789Sahrens 	 * there won't be any threads available to service I/O completion
1747789Sahrens 	 * interrupts.
1748789Sahrens 	 */
1749789Sahrens 	if ((1U << zio->io_stage) & zio->io_async_stages) {
1750789Sahrens 		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
1751789Sahrens 			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];	/* pre-completion stages: issue taskq */
1752789Sahrens 		else
1753789Sahrens 			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];	/* VDEV_IO_DONE and later: intr taskq */
1754789Sahrens 		(void) taskq_dispatch(tq,
1755789Sahrens 		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
1756789Sahrens 	} else {
1757789Sahrens 		zio_pipeline[zio->io_stage](zio);
1758789Sahrens 	}
1759789Sahrens }
1760789Sahrens 
/*
 * ZIL allocation fault injection: returns B_TRUE once every
 * 2^zio_zil_fail_shift calls.  Callers must check zio_zil_fail_shift != 0
 * first (see zio_alloc_blk()); with a shift of 0 this would return B_TRUE
 * on every call.
 * NOTE(review): 'allocs' is unsynchronized static state, so concurrent
 * callers may race on it; presumably acceptable for a test-only knob.
 */
17613668Sgw25295 static boolean_t
17623668Sgw25295 zio_alloc_should_fail(void)
17633668Sgw25295 {
17643668Sgw25295 	static uint16_t	allocs = 0;
17653668Sgw25295 
17663668Sgw25295 	return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
17673668Sgw25295 }
17683668Sgw25295 
1769789Sahrens /*
1770789Sahrens  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 * Holds the spa config lock (reader) across the allocation.
1771789Sahrens  */
1772789Sahrens int
17733063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
17743063Sperrin     uint64_t txg)
1775789Sahrens {
1776789Sahrens 	int error;
1777789Sahrens 
17781544Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
1779789Sahrens 
	/* Optional fault injection for ZIL testing (see zio_alloc_should_fail()). */
17803668Sgw25295 	if (zio_zil_fail_shift && zio_alloc_should_fail()) {
17813668Sgw25295 		spa_config_exit(spa, FTAG);
17823668Sgw25295 		return (ENOSPC);
17833668Sgw25295 	}
17843668Sgw25295 
17853063Sperrin 	/*
17864527Sperrin 	 * We were passed the previous log block's DVA in bp->blk_dva[0].
17874527Sperrin 	 * We use that as a hint for which vdev to allocate from next.
	 *
	 * Prefer the dedicated log class; fall back to the normal class
	 * if that allocation fails.
17883063Sperrin 	 */
17894527Sperrin 	error = metaslab_alloc(spa, spa->spa_log_class, size,
17904527Sperrin 	    new_bp, 1, txg, old_bp, B_TRUE);
17914527Sperrin 
17924527Sperrin 	if (error)
17934527Sperrin 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
17944527Sperrin 		    new_bp, 1, txg, old_bp, B_TRUE);
1795789Sahrens 
	/* Fill in the rest of the block pointer for a freshly allocated ZIL block. */
1796789Sahrens 	if (error == 0) {
17973063Sperrin 		BP_SET_LSIZE(new_bp, size);
17983063Sperrin 		BP_SET_PSIZE(new_bp, size);	/* uncompressed: physical size equals logical size */
17993063Sperrin 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
18003063Sperrin 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
18013063Sperrin 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
18023063Sperrin 		BP_SET_LEVEL(new_bp, 0);
18033063Sperrin 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
18043063Sperrin 		new_bp->blk_birth = txg;
1805789Sahrens 	}
1806789Sahrens 
18071544Seschrock 	spa_config_exit(spa, FTAG);
1808789Sahrens 
1809789Sahrens 	return (error);
1810789Sahrens }
1811789Sahrens 
1812789Sahrens /*
1813789Sahrens  * Free an intent log block.  We know it can't be a gang block, so there's
1814789Sahrens  * nothing to do except metaslab_free() it.
1815789Sahrens  */
1816789Sahrens void
1817789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
1818789Sahrens {
18191775Sbillm 	ASSERT(!BP_IS_GANG(bp));
1820789Sahrens 
18211544Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
1822789Sahrens 
18231807Sbonwick 	metaslab_free(spa, bp, txg, B_FALSE);
1824789Sahrens 
18251544Seschrock 	spa_config_exit(spa, FTAG);
1826789Sahrens }
18274469Sperrin 
18284469Sperrin /*
18294469Sperrin  * start an async flush of the write cache for this vdev
 *
 * On first use, *zio is lazily initialized to a root zio that parents the
 * flush ioctl(s); the caller owns that root zio and is presumably expected
 * to wait on it — confirm against callers.
18304469Sperrin  */
18314469Sperrin void
18324469Sperrin zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
18334469Sperrin {
18344469Sperrin 	vdev_t *vd;
18354469Sperrin 
18364469Sperrin 	/*
18374469Sperrin 	 * Lock out configuration changes.
18384469Sperrin 	 */
18394469Sperrin 	spa_config_enter(spa, RW_READER, FTAG);
18404469Sperrin 
18414469Sperrin 	if (*zio == NULL)
18424469Sperrin 		*zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
18434469Sperrin 
18444469Sperrin 	vd = vdev_lookup_top(spa, vdev);
18454469Sperrin 	ASSERT(vd);
18464469Sperrin 
	/* Issue the write-cache flush ioctl; best-effort (CANFAIL, no retry). */
18474469Sperrin 	(void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
18484469Sperrin 	    NULL, NULL, ZIO_PRIORITY_NOW,
18494469Sperrin 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
18504469Sperrin 
18514469Sperrin 	spa_config_exit(spa, FTAG);
18524469Sperrin }
1853