1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 223459Sek110237 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 67*3668Sgw25295 /* Force an allocation failure when non-zero */ 68*3668Sgw25295 uint16_t zio_zil_fail_shift = 0; 69*3668Sgw25295 70789Sahrens typedef struct zio_sync_pass { 71789Sahrens int zp_defer_free; /* defer frees after this pass */ 72789Sahrens int zp_dontcompress; /* don't compress after this pass */ 73789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 74789Sahrens } zio_sync_pass_t; 75789Sahrens 76789Sahrens zio_sync_pass_t zio_sync_pass = { 77789Sahrens 1, /* zp_defer_free */ 78789Sahrens 4, /* zp_dontcompress */ 79789Sahrens 1, /* zp_rewrite */ 80789Sahrens }; 81789Sahrens 82789Sahrens /* 83789Sahrens * ========================================================================== 84789Sahrens * I/O kmem caches 85789Sahrens * ========================================================================== 86789Sahrens */ 87789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 883290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 893290Sjohansen 903290Sjohansen #ifdef _KERNEL 913290Sjohansen extern vmem_t *zio_alloc_arena; 923290Sjohansen #endif 93789Sahrens 94789Sahrens void 95789Sahrens zio_init(void) 96789Sahrens { 97789Sahrens size_t c; 983290Sjohansen vmem_t *data_alloc_arena = NULL; 993290Sjohansen 1003290Sjohansen #ifdef _KERNEL 1013290Sjohansen data_alloc_arena = zio_alloc_arena; 1023290Sjohansen #endif 103789Sahrens 104789Sahrens /* 105789Sahrens * For small buffers, we want a cache for each multiple of 106789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 107789Sahrens * for each quarter-power of 2. For large buffers, we want 108789Sahrens * a cache for each multiple of PAGESIZE. 109789Sahrens */ 110789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 111789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 112789Sahrens size_t p2 = size; 113789Sahrens size_t align = 0; 114789Sahrens 115789Sahrens while (p2 & (p2 - 1)) 116789Sahrens p2 &= p2 - 1; 117789Sahrens 118789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 119789Sahrens align = SPA_MINBLOCKSIZE; 120789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 121789Sahrens align = PAGESIZE; 122789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 123789Sahrens align = p2 >> 2; 124789Sahrens } 125789Sahrens 126789Sahrens if (align != 0) { 1273290Sjohansen char name[36]; 1282856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 129789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 130849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1313290Sjohansen 1323290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1333290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1343290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1353290Sjohansen KMC_NODEBUG); 1363290Sjohansen 137789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 138789Sahrens size, align); 139789Sahrens } 140789Sahrens } 141789Sahrens 142789Sahrens while (--c != 0) { 143789Sahrens ASSERT(zio_buf_cache[c] != NULL); 144789Sahrens if (zio_buf_cache[c - 1] == NULL) 145789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1463290Sjohansen 1473290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1483290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1493290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 150789Sahrens } 1511544Seschrock 1521544Seschrock zio_inject_init(); 153789Sahrens } 154789Sahrens 155789Sahrens void 156789Sahrens zio_fini(void) 157789Sahrens { 158789Sahrens size_t c; 159789Sahrens kmem_cache_t *last_cache = NULL; 1603290Sjohansen kmem_cache_t *last_data_cache = NULL; 161789Sahrens 162789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 163789Sahrens if (zio_buf_cache[c] != last_cache) { 164789Sahrens last_cache = zio_buf_cache[c]; 165789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 166789Sahrens } 167789Sahrens zio_buf_cache[c] = NULL; 1683290Sjohansen 1693290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1703290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1713290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1723290Sjohansen } 1733290Sjohansen zio_data_buf_cache[c] = NULL; 174789Sahrens } 1751544Seschrock 1761544Seschrock zio_inject_fini(); 177789Sahrens } 178789Sahrens 179789Sahrens /* 180789Sahrens * ========================================================================== 181789Sahrens * Allocate and free I/O buffers 182789Sahrens * ========================================================================== 183789Sahrens */ 1843290Sjohansen 1853290Sjohansen /* 1863290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1873290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1883290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1893290Sjohansen * excess / transient data in-core during a crashdump. 1903290Sjohansen */ 191789Sahrens void * 192789Sahrens zio_buf_alloc(size_t size) 193789Sahrens { 194789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 195789Sahrens 196789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 197789Sahrens 198789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 199789Sahrens } 200789Sahrens 2013290Sjohansen /* 2023290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2033290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2043290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2053290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2063290Sjohansen */ 2073290Sjohansen void * 2083290Sjohansen zio_data_buf_alloc(size_t size) 2093290Sjohansen { 2103290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2113290Sjohansen 2123290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2133290Sjohansen 2143290Sjohansen return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 2153290Sjohansen } 2163290Sjohansen 217789Sahrens void 218789Sahrens zio_buf_free(void *buf, size_t size) 219789Sahrens { 220789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 221789Sahrens 222789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 223789Sahrens 224789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 225789Sahrens } 226789Sahrens 2273290Sjohansen void 2283290Sjohansen zio_data_buf_free(void *buf, size_t size) 2293290Sjohansen { 2303290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2313290Sjohansen 2323290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2333290Sjohansen 2343290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2353290Sjohansen } 2363463Sahrens 237789Sahrens /* 238789Sahrens * ========================================================================== 239789Sahrens * Push and pop I/O transform buffers 240789Sahrens * ========================================================================== 241789Sahrens */ 242789Sahrens static void 243789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 244789Sahrens { 245789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 246789Sahrens 247789Sahrens zt->zt_data = data; 248789Sahrens zt->zt_size = size; 249789Sahrens zt->zt_bufsize = bufsize; 250789Sahrens 251789Sahrens zt->zt_next = zio->io_transform_stack; 252789Sahrens zio->io_transform_stack = zt; 253789Sahrens 254789Sahrens zio->io_data = data; 255789Sahrens zio->io_size = size; 256789Sahrens } 257789Sahrens 258789Sahrens static void 259789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 260789Sahrens { 261789Sahrens zio_transform_t *zt = zio->io_transform_stack; 262789Sahrens 263789Sahrens *data = zt->zt_data; 264789Sahrens *size = zt->zt_size; 265789Sahrens *bufsize = zt->zt_bufsize; 266789Sahrens 267789Sahrens zio->io_transform_stack = zt->zt_next; 268789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 269789Sahrens 270789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 271789Sahrens zio->io_data = zt->zt_data; 272789Sahrens zio->io_size = zt->zt_size; 273789Sahrens } 274789Sahrens } 275789Sahrens 276789Sahrens static void 277789Sahrens zio_clear_transform_stack(zio_t *zio) 278789Sahrens { 279789Sahrens void *data; 280789Sahrens uint64_t size, bufsize; 281789Sahrens 282789Sahrens ASSERT(zio->io_transform_stack != NULL); 283789Sahrens 284789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 285789Sahrens while (zio->io_transform_stack != NULL) { 286789Sahrens zio_buf_free(data, bufsize); 287789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 288789Sahrens } 289789Sahrens } 290789Sahrens 291789Sahrens /* 292789Sahrens * ========================================================================== 293789Sahrens * Create the various types of I/O (read, write, free) 294789Sahrens * ========================================================================== 295789Sahrens */ 296789Sahrens static zio_t * 297789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 298789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 299789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 300789Sahrens { 301789Sahrens zio_t *zio; 302789Sahrens 303789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 304789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 305789Sahrens 306789Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 307789Sahrens zio->io_parent = pio; 308789Sahrens zio->io_spa = spa; 309789Sahrens zio->io_txg = txg; 310789Sahrens if (bp != NULL) { 311789Sahrens zio->io_bp = bp; 312789Sahrens zio->io_bp_copy = *bp; 313789Sahrens zio->io_bp_orig = *bp; 314789Sahrens } 315789Sahrens zio->io_done = done; 316789Sahrens zio->io_private = private; 317789Sahrens zio->io_type = type; 318789Sahrens zio->io_priority = priority; 319789Sahrens zio->io_stage = stage; 320789Sahrens zio->io_pipeline = pipeline; 321789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 322789Sahrens zio->io_timestamp = lbolt64; 323789Sahrens zio->io_flags = flags; 3242856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 325789Sahrens zio_push_transform(zio, data, size, size); 326789Sahrens 3273463Sahrens /* 3283463Sahrens * Note on config lock: 3293463Sahrens * 3303463Sahrens * If CONFIG_HELD is set, then the caller already has the config 3313463Sahrens * lock, so we don't need it for this io. 3323463Sahrens * 3333463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 3343463Sahrens * config lock on behalf of this io, so it should be released 3353463Sahrens * in zio_done. 3363463Sahrens * 3373463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 3383463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 3393463Sahrens * The NULL top-level ios rarely have any children, so we delay 3403463Sahrens * grabbing the lock until the first child is added (but it is 3413463Sahrens * still grabbed on behalf of the top-level i/o, so additional 3423463Sahrens * children don't need to also grab it). This greatly reduces 3433463Sahrens * contention on the config lock. 3443463Sahrens */ 345789Sahrens if (pio == NULL) { 3463463Sahrens if (type != ZIO_TYPE_NULL && 3473463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3481544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 3493463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3503463Sahrens } 351789Sahrens zio->io_root = zio; 352789Sahrens } else { 353789Sahrens zio->io_root = pio->io_root; 3541544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3551544Seschrock zio->io_logical = pio->io_logical; 356789Sahrens mutex_enter(&pio->io_lock); 3573463Sahrens if (pio->io_parent == NULL && 3583463Sahrens pio->io_type == ZIO_TYPE_NULL && 3593463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 3603463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 3613463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3623463Sahrens spa_config_enter(zio->io_spa, RW_READER, pio); 3633463Sahrens } 364789Sahrens if (stage < ZIO_STAGE_READY) 365789Sahrens pio->io_children_notready++; 366789Sahrens pio->io_children_notdone++; 367789Sahrens zio->io_sibling_next = pio->io_child; 368789Sahrens zio->io_sibling_prev = NULL; 369789Sahrens if (pio->io_child != NULL) 370789Sahrens pio->io_child->io_sibling_prev = zio; 371789Sahrens pio->io_child = zio; 3721775Sbillm zio->io_ndvas = pio->io_ndvas; 373789Sahrens mutex_exit(&pio->io_lock); 374789Sahrens } 375789Sahrens 376789Sahrens return (zio); 377789Sahrens } 378789Sahrens 379789Sahrens zio_t * 380789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 381789Sahrens int flags) 382789Sahrens { 383789Sahrens zio_t *zio; 384789Sahrens 385789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 386789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 387789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 388789Sahrens 389789Sahrens return (zio); 390789Sahrens } 391789Sahrens 392789Sahrens zio_t * 393789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 394789Sahrens { 395789Sahrens return (zio_null(NULL, spa, done, private, flags)); 396789Sahrens } 397789Sahrens 398789Sahrens zio_t * 399789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 400789Sahrens uint64_t size, zio_done_func_t *done, void *private, 4011544Seschrock int priority, int flags, zbookmark_t *zb) 402789Sahrens { 403789Sahrens zio_t *zio; 404789Sahrens 405789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 406789Sahrens 407789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 4082981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4092981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4101544Seschrock zio->io_bookmark = *zb; 4111544Seschrock 4121544Seschrock zio->io_logical = zio; 413789Sahrens 414789Sahrens /* 415789Sahrens * Work off our copy of the bp so the caller can free it. 416789Sahrens */ 417789Sahrens zio->io_bp = &zio->io_bp_copy; 418789Sahrens 419789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 420789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 421789Sahrens void *cbuf = zio_buf_alloc(csize); 422789Sahrens 423789Sahrens zio_push_transform(zio, cbuf, csize, csize); 424789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 425789Sahrens } 426789Sahrens 4271775Sbillm if (BP_IS_GANG(bp)) { 428789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 429789Sahrens void *gbuf = zio_buf_alloc(gsize); 430789Sahrens 431789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 432789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 433789Sahrens } 434789Sahrens 435789Sahrens return (zio); 436789Sahrens } 437789Sahrens 438789Sahrens zio_t * 4391775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 440789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4413547Smaybee zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 4423547Smaybee int flags, zbookmark_t *zb) 443789Sahrens { 444789Sahrens zio_t *zio; 445789Sahrens 446789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 447789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 448789Sahrens 449789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 450789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 451789Sahrens 452789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4532981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 454789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 455789Sahrens 4563547Smaybee zio->io_ready = ready; 4573547Smaybee 4581544Seschrock zio->io_bookmark = *zb; 4591544Seschrock 4601544Seschrock zio->io_logical = zio; 4611544Seschrock 462789Sahrens zio->io_checksum = checksum; 463789Sahrens zio->io_compress = compress; 4641775Sbillm zio->io_ndvas = ncopies; 465789Sahrens 466789Sahrens if (compress != ZIO_COMPRESS_OFF) 467789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 468789Sahrens 469789Sahrens if (bp->blk_birth != txg) { 470789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 471789Sahrens BP_ZERO(bp); 472789Sahrens BP_SET_LSIZE(bp, size); 473789Sahrens BP_SET_PSIZE(bp, size); 4741775Sbillm } else { 4751775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 4761775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 4771775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 478789Sahrens } 479789Sahrens 480789Sahrens return (zio); 481789Sahrens } 482789Sahrens 483789Sahrens zio_t * 484789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 485789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4861544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4871544Seschrock zbookmark_t *zb) 488789Sahrens { 489789Sahrens zio_t *zio; 490789Sahrens 491789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4922981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 493789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 494789Sahrens 4951544Seschrock zio->io_bookmark = *zb; 496789Sahrens zio->io_checksum = checksum; 497789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 498789Sahrens 4991775Sbillm if (pio != NULL) 5001775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 5011775Sbillm 502789Sahrens return (zio); 503789Sahrens } 504789Sahrens 505789Sahrens static zio_t * 506789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 507789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 508789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 509789Sahrens { 510789Sahrens zio_t *zio; 511789Sahrens 512789Sahrens BP_ZERO(bp); 513789Sahrens BP_SET_LSIZE(bp, size); 514789Sahrens BP_SET_PSIZE(bp, size); 515789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 516789Sahrens 517789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 518789Sahrens ZIO_TYPE_WRITE, priority, flags, 519789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 520789Sahrens 521789Sahrens zio->io_checksum = checksum; 522789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 523789Sahrens 524789Sahrens return (zio); 525789Sahrens } 526789Sahrens 527789Sahrens zio_t * 528789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 529789Sahrens zio_done_func_t *done, void *private) 530789Sahrens { 531789Sahrens zio_t *zio; 532789Sahrens 533789Sahrens ASSERT(!BP_IS_HOLE(bp)); 534789Sahrens 535789Sahrens if (txg == spa->spa_syncing_txg && 536789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 537789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 538789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 539789Sahrens } 540789Sahrens 541789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 5422981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 543789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 544789Sahrens 545789Sahrens zio->io_bp = &zio->io_bp_copy; 546789Sahrens 547789Sahrens return (zio); 548789Sahrens } 549789Sahrens 550789Sahrens zio_t * 551789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 552789Sahrens zio_done_func_t *done, void *private) 553789Sahrens { 554789Sahrens zio_t *zio; 555789Sahrens 556789Sahrens /* 557789Sahrens * A claim is an allocation of a specific block. Claims are needed 558789Sahrens * to support immediate writes in the intent log. The issue is that 559789Sahrens * immediate writes contain committed data, but in a txg that was 560789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 561789Sahrens * the intent log claims all blocks that contain immediate write data 562789Sahrens * so that the SPA knows they're in use. 563789Sahrens * 564789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 565789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 566789Sahrens */ 567789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 568789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 569789Sahrens 570789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 571789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 572789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 573789Sahrens 574789Sahrens zio->io_bp = &zio->io_bp_copy; 575789Sahrens 576789Sahrens return (zio); 577789Sahrens } 578789Sahrens 579789Sahrens zio_t * 580789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 581789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 582789Sahrens { 583789Sahrens zio_t *zio; 584789Sahrens int c; 585789Sahrens 586789Sahrens if (vd->vdev_children == 0) { 587789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 588789Sahrens ZIO_TYPE_IOCTL, priority, flags, 589789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 590789Sahrens 591789Sahrens zio->io_vd = vd; 592789Sahrens zio->io_cmd = cmd; 593789Sahrens } else { 594789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 595789Sahrens 596789Sahrens for (c = 0; c < vd->vdev_children; c++) 597789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 598789Sahrens done, private, priority, flags)); 599789Sahrens } 600789Sahrens 601789Sahrens return (zio); 602789Sahrens } 603789Sahrens 604789Sahrens static void 605789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 606789Sahrens int checksum) 607789Sahrens { 608789Sahrens ASSERT(vd->vdev_children == 0); 609789Sahrens 610789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 611789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 612789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 613789Sahrens 614789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 615789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 616789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 617789Sahrens 618789Sahrens BP_ZERO(bp); 619789Sahrens 620789Sahrens BP_SET_LSIZE(bp, size); 621789Sahrens BP_SET_PSIZE(bp, size); 622789Sahrens 623789Sahrens BP_SET_CHECKSUM(bp, checksum); 624789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 625789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 626789Sahrens 627789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 628789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 629789Sahrens } 630789Sahrens 631789Sahrens zio_t * 632789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 633789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 634789Sahrens int priority, int flags) 635789Sahrens { 636789Sahrens zio_t *zio; 637789Sahrens blkptr_t blk; 638789Sahrens 639789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 640789Sahrens 641789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 642789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 643789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 644789Sahrens 645789Sahrens zio->io_vd = vd; 646789Sahrens zio->io_offset = offset; 647789Sahrens 648789Sahrens /* 649789Sahrens * Work off our copy of the bp so the caller can free it. 650789Sahrens */ 651789Sahrens zio->io_bp = &zio->io_bp_copy; 652789Sahrens 653789Sahrens return (zio); 654789Sahrens } 655789Sahrens 656789Sahrens zio_t * 657789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 658789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 659789Sahrens int priority, int flags) 660789Sahrens { 661789Sahrens zio_block_tail_t *zbt; 662789Sahrens void *wbuf; 663789Sahrens zio_t *zio; 664789Sahrens blkptr_t blk; 665789Sahrens 666789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 667789Sahrens 668789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 669789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 670789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 671789Sahrens 672789Sahrens zio->io_vd = vd; 673789Sahrens zio->io_offset = offset; 674789Sahrens 675789Sahrens zio->io_bp = &zio->io_bp_copy; 676789Sahrens zio->io_checksum = checksum; 677789Sahrens 678789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 679789Sahrens /* 680789Sahrens * zbt checksums are necessarily destructive -- they modify 681789Sahrens * one word of the write buffer to hold the verifier/checksum. 682789Sahrens * Therefore, we must make a local copy in case the data is 683789Sahrens * being written to multiple places. 684789Sahrens */ 685789Sahrens wbuf = zio_buf_alloc(size); 686789Sahrens bcopy(data, wbuf, size); 687789Sahrens zio_push_transform(zio, wbuf, size, size); 688789Sahrens 689789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 690789Sahrens zbt->zbt_cksum = blk.blk_cksum; 691789Sahrens } 692789Sahrens 693789Sahrens return (zio); 694789Sahrens } 695789Sahrens 696789Sahrens /* 697789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 698789Sahrens */ 699789Sahrens zio_t * 700789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 701789Sahrens void *data, uint64_t size, int type, int priority, int flags, 702789Sahrens zio_done_func_t *done, void *private) 703789Sahrens { 704789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 705789Sahrens zio_t *cio; 706789Sahrens 707789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 708789Sahrens /* 709789Sahrens * If we have the bp, then the child should perform the 710789Sahrens * checksum and the parent need not. This pushes error 711789Sahrens * detection as close to the leaves as possible and 712789Sahrens * eliminates redundant checksums in the interior nodes. 713789Sahrens */ 714789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 715789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 716789Sahrens } 717789Sahrens 718789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 719789Sahrens done, private, type, priority, 720789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 7211775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 722789Sahrens 723789Sahrens cio->io_vd = vd; 724789Sahrens cio->io_offset = offset; 725789Sahrens 726789Sahrens return (cio); 727789Sahrens } 728789Sahrens 729789Sahrens /* 730789Sahrens * ========================================================================== 731789Sahrens * Initiate I/O, either sync or async 732789Sahrens * ========================================================================== 733789Sahrens */ 734789Sahrens int 735789Sahrens zio_wait(zio_t *zio) 736789Sahrens { 737789Sahrens int error; 738789Sahrens 739789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 740789Sahrens 741789Sahrens zio->io_waiter = curthread; 742789Sahrens 743789Sahrens zio_next_stage_async(zio); 744789Sahrens 745789Sahrens mutex_enter(&zio->io_lock); 746789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 747789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 748789Sahrens mutex_exit(&zio->io_lock); 749789Sahrens 750789Sahrens error = zio->io_error; 7512856Snd150628 mutex_destroy(&zio->io_lock); 752789Sahrens kmem_free(zio, sizeof (zio_t)); 753789Sahrens 754789Sahrens return (error); 755789Sahrens } 756789Sahrens 757789Sahrens void 758789Sahrens zio_nowait(zio_t *zio) 759789Sahrens { 760789Sahrens zio_next_stage_async(zio); 761789Sahrens } 762789Sahrens 763789Sahrens /* 764789Sahrens * ========================================================================== 765789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 766789Sahrens * ========================================================================== 767789Sahrens */ 768789Sahrens static void 769789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 770789Sahrens { 771789Sahrens mutex_enter(&zio->io_lock); 772789Sahrens if (*countp == 0) { 773789Sahrens ASSERT(zio->io_stalled == 0); 774789Sahrens mutex_exit(&zio->io_lock); 775789Sahrens zio_next_stage(zio); 776789Sahrens } else { 777789Sahrens zio->io_stalled = stage; 778789Sahrens mutex_exit(&zio->io_lock); 779789Sahrens } 780789Sahrens } 781789Sahrens 782789Sahrens static void 783789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 784789Sahrens { 785789Sahrens zio_t *pio = zio->io_parent; 786789Sahrens 787789Sahrens mutex_enter(&pio->io_lock); 788789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 789789Sahrens pio->io_error = zio->io_error; 790789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 791789Sahrens pio->io_stalled = 0; 792789Sahrens mutex_exit(&pio->io_lock); 793789Sahrens zio_next_stage_async(pio); 794789Sahrens } else { 795789Sahrens mutex_exit(&pio->io_lock); 796789Sahrens } 797789Sahrens } 798789Sahrens 799789Sahrens static void 800789Sahrens zio_wait_children_ready(zio_t *zio) 801789Sahrens { 802789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 803789Sahrens &zio->io_children_notready); 804789Sahrens } 805789Sahrens 806789Sahrens void 807789Sahrens zio_wait_children_done(zio_t *zio) 808789Sahrens { 809789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 810789Sahrens &zio->io_children_notdone); 811789Sahrens } 812789Sahrens 813789Sahrens static void 814789Sahrens zio_ready(zio_t *zio) 815789Sahrens { 816789Sahrens zio_t *pio = zio->io_parent; 817789Sahrens 8183547Smaybee if (zio->io_ready) 8193547Smaybee zio->io_ready(zio); 8203547Smaybee 821789Sahrens if (pio != NULL) 822789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 823789Sahrens &pio->io_children_notready); 824789Sahrens 825789Sahrens if (zio->io_bp) 826789Sahrens zio->io_bp_copy = *zio->io_bp; 827789Sahrens 828789Sahrens zio_next_stage(zio); 829789Sahrens } 830789Sahrens 831789Sahrens static void 832789Sahrens zio_done(zio_t *zio) 833789Sahrens { 834789Sahrens zio_t *pio = zio->io_parent; 835789Sahrens spa_t *spa = zio->io_spa; 836789Sahrens blkptr_t *bp = zio->io_bp; 837789Sahrens vdev_t *vd = zio->io_vd; 838789Sahrens 839789Sahrens ASSERT(zio->io_children_notready == 0); 840789Sahrens ASSERT(zio->io_children_notdone == 0); 841789Sahrens 842789Sahrens if (bp != NULL) { 843789Sahrens ASSERT(bp->blk_pad[0] == 0); 844789Sahrens ASSERT(bp->blk_pad[1] == 0); 845789Sahrens ASSERT(bp->blk_pad[2] == 0); 846789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 847789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 8481775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 849789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 8501775Sbillm if (zio->io_ndvas != 0) 8511775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 8521775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 8531775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 8541775Sbillm } 855789Sahrens } 856789Sahrens 857789Sahrens if (vd != NULL) 858789Sahrens vdev_stat_update(zio); 859789Sahrens 860789Sahrens if (zio->io_error) { 8611544Seschrock /* 8621544Seschrock * If this I/O is attached to a particular vdev, 8631544Seschrock * generate an error message describing the I/O failure 8641544Seschrock * at the block level. We ignore these errors if the 8651544Seschrock * device is currently unavailable. 8661544Seschrock */ 8671732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 8681544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 8691732Sbonwick zio->io_spa, vd, zio, 0, 0); 870789Sahrens 8711544Seschrock if ((zio->io_error == EIO || 8721544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 8731544Seschrock zio->io_logical == zio) { 8741544Seschrock /* 8751544Seschrock * For root I/O requests, tell the SPA to log the error 8761544Seschrock * appropriately. Also, generate a logical data 8771544Seschrock * ereport. 8781544Seschrock */ 8791544Seschrock spa_log_error(zio->io_spa, zio); 8801544Seschrock 8811544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 8821544Seschrock zio->io_spa, NULL, zio, 0, 0); 8831544Seschrock } 884789Sahrens 8851544Seschrock /* 8861544Seschrock * For I/O requests that cannot fail, panic appropriately. 8871544Seschrock */ 8881544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 8893459Sek110237 char *blkbuf; 8903459Sek110237 8913459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 8923459Sek110237 if (blkbuf) { 8933459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 8943459Sek110237 bp ? bp : &zio->io_bp_copy); 8953459Sek110237 } 8961544Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 8971544Seschrock "%d", zio->io_error == ECKSUM ? 8981544Seschrock "bad checksum" : "I/O failure", 8991544Seschrock zio_type_name[zio->io_type], 9001544Seschrock vdev_description(vd), 9011544Seschrock (u_longlong_t)zio->io_offset, 9023459Sek110237 zio, blkbuf ? blkbuf : "", zio->io_error); 9031544Seschrock } 904789Sahrens } 905789Sahrens zio_clear_transform_stack(zio); 906789Sahrens 907789Sahrens if (zio->io_done) 908789Sahrens zio->io_done(zio); 909789Sahrens 910789Sahrens ASSERT(zio->io_delegate_list == NULL); 911789Sahrens ASSERT(zio->io_delegate_next == NULL); 912789Sahrens 913789Sahrens if (pio != NULL) { 914789Sahrens zio_t *next, *prev; 915789Sahrens 916789Sahrens mutex_enter(&pio->io_lock); 917789Sahrens next = zio->io_sibling_next; 918789Sahrens prev = zio->io_sibling_prev; 919789Sahrens if (next != NULL) 920789Sahrens next->io_sibling_prev = prev; 921789Sahrens if (prev != NULL) 922789Sahrens prev->io_sibling_next = next; 923789Sahrens if (pio->io_child == zio) 924789Sahrens pio->io_child = next; 925789Sahrens mutex_exit(&pio->io_lock); 926789Sahrens 927789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 928789Sahrens &pio->io_children_notdone); 929789Sahrens } 930789Sahrens 9313463Sahrens /* 9323463Sahrens * Note: this I/O is now done, and will shortly be 9333463Sahrens * kmem_free()'d, so there is no need to clear this (or any 9343463Sahrens * other) flag. 9353463Sahrens */ 9363463Sahrens if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 9371544Seschrock spa_config_exit(spa, zio); 938789Sahrens 939789Sahrens if (zio->io_waiter != NULL) { 940789Sahrens mutex_enter(&zio->io_lock); 941789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 942789Sahrens zio->io_stalled = zio->io_stage; 943789Sahrens cv_broadcast(&zio->io_cv); 944789Sahrens mutex_exit(&zio->io_lock); 945789Sahrens } else { 946789Sahrens kmem_free(zio, sizeof (zio_t)); 947789Sahrens } 948789Sahrens } 949789Sahrens 950789Sahrens /* 951789Sahrens * ========================================================================== 952789Sahrens * Compression support 953789Sahrens * ========================================================================== 954789Sahrens */ 955789Sahrens static void 956789Sahrens zio_write_compress(zio_t *zio) 957789Sahrens { 958789Sahrens int compress = zio->io_compress; 959789Sahrens blkptr_t *bp = zio->io_bp; 960789Sahrens void *cbuf; 961789Sahrens uint64_t lsize = zio->io_size; 962789Sahrens uint64_t csize = lsize; 963789Sahrens uint64_t cbufsize = 0; 964789Sahrens int pass; 965789Sahrens 966789Sahrens if (bp->blk_birth == zio->io_txg) { 967789Sahrens /* 968789Sahrens * We're rewriting an existing block, which means we're 969789Sahrens * working on behalf of spa_sync(). For spa_sync() to 970789Sahrens * converge, it must eventually be the case that we don't 971789Sahrens * have to allocate new blocks. But compression changes 972789Sahrens * the blocksize, which forces a reallocate, and makes 973789Sahrens * convergence take longer. Therefore, after the first 974789Sahrens * few passes, stop compressing to ensure convergence. 975789Sahrens */ 976789Sahrens pass = spa_sync_pass(zio->io_spa); 977789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 978789Sahrens compress = ZIO_COMPRESS_OFF; 979789Sahrens } else { 980789Sahrens ASSERT(BP_IS_HOLE(bp)); 981789Sahrens pass = 1; 982789Sahrens } 983789Sahrens 984789Sahrens if (compress != ZIO_COMPRESS_OFF) 985789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 986789Sahrens &cbuf, &csize, &cbufsize)) 987789Sahrens compress = ZIO_COMPRESS_OFF; 988789Sahrens 989789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 990789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 991789Sahrens 992789Sahrens /* 993789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 994789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 995789Sahrens * but newly allocated blocks are sequential, so they can be written 996789Sahrens * to disk faster. Therefore, we allow the first few passes of 997789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 998789Sahrens * There should only be a handful of blocks after pass 1 in any case. 999789Sahrens */ 1000789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 1001789Sahrens pass > zio_sync_pass.zp_rewrite) { 1002789Sahrens ASSERT(csize != 0); 10032885Sahrens BP_SET_LSIZE(bp, lsize); 10042885Sahrens BP_SET_COMPRESS(bp, compress); 1005789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 1006789Sahrens } else { 1007789Sahrens if (bp->blk_birth == zio->io_txg) { 1008789Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 1009789Sahrens bzero(bp, sizeof (blkptr_t)); 1010789Sahrens } 1011789Sahrens if (csize == 0) { 1012789Sahrens BP_ZERO(bp); 1013789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1014789Sahrens } else { 10151775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1016789Sahrens BP_SET_LSIZE(bp, lsize); 1017789Sahrens BP_SET_PSIZE(bp, csize); 1018789Sahrens BP_SET_COMPRESS(bp, compress); 1019789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 1020789Sahrens } 1021789Sahrens } 1022789Sahrens 1023789Sahrens zio_next_stage(zio); 1024789Sahrens } 1025789Sahrens 1026789Sahrens static void 1027789Sahrens zio_read_decompress(zio_t *zio) 1028789Sahrens { 1029789Sahrens blkptr_t *bp = zio->io_bp; 1030789Sahrens void *data; 1031789Sahrens uint64_t size; 1032789Sahrens uint64_t bufsize; 1033789Sahrens int compress = BP_GET_COMPRESS(bp); 1034789Sahrens 1035789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 1036789Sahrens 1037789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 1038789Sahrens 1039789Sahrens if (zio_decompress_data(compress, data, size, 1040789Sahrens zio->io_data, zio->io_size)) 1041789Sahrens zio->io_error = EIO; 1042789Sahrens 1043789Sahrens zio_buf_free(data, bufsize); 1044789Sahrens 1045789Sahrens zio_next_stage(zio); 1046789Sahrens } 1047789Sahrens 1048789Sahrens /* 1049789Sahrens * ========================================================================== 1050789Sahrens * Gang block support 1051789Sahrens * ========================================================================== 1052789Sahrens */ 1053789Sahrens static void 1054789Sahrens zio_gang_pipeline(zio_t *zio) 1055789Sahrens { 1056789Sahrens /* 1057789Sahrens * By default, the pipeline assumes that we're dealing with a gang 1058789Sahrens * block. If we're not, strip out any gang-specific stages. 1059789Sahrens */ 10601775Sbillm if (!BP_IS_GANG(zio->io_bp)) 1061789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 1062789Sahrens 1063789Sahrens zio_next_stage(zio); 1064789Sahrens } 1065789Sahrens 1066789Sahrens static void 1067789Sahrens zio_gang_byteswap(zio_t *zio) 1068789Sahrens { 1069789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1070789Sahrens 1071789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1072789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1073789Sahrens } 1074789Sahrens 1075789Sahrens static void 1076789Sahrens zio_get_gang_header(zio_t *zio) 1077789Sahrens { 1078789Sahrens blkptr_t *bp = zio->io_bp; 1079789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1080789Sahrens void *gbuf = zio_buf_alloc(gsize); 1081789Sahrens 10821775Sbillm ASSERT(BP_IS_GANG(bp)); 1083789Sahrens 1084789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1085789Sahrens 1086789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1087789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1088789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1089789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 1090789Sahrens 1091789Sahrens zio_wait_children_done(zio); 1092789Sahrens } 1093789Sahrens 1094789Sahrens static void 1095789Sahrens zio_read_gang_members(zio_t *zio) 1096789Sahrens { 1097789Sahrens zio_gbh_phys_t *gbh; 1098789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1099789Sahrens int i; 1100789Sahrens 11011775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1102789Sahrens 1103789Sahrens zio_gang_byteswap(zio); 1104789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1105789Sahrens 1106789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1107789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1108789Sahrens lsize = BP_GET_PSIZE(gbp); 1109789Sahrens 1110789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1111789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1112789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1113789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1114789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1115789Sahrens 1116789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1117789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 11181544Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 11191544Seschrock &zio->io_bookmark)); 1120789Sahrens } 1121789Sahrens 1122789Sahrens zio_buf_free(gbh, gbufsize); 1123789Sahrens zio_wait_children_done(zio); 1124789Sahrens } 1125789Sahrens 1126789Sahrens static void 1127789Sahrens zio_rewrite_gang_members(zio_t *zio) 1128789Sahrens { 1129789Sahrens zio_gbh_phys_t *gbh; 1130789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1131789Sahrens int i; 1132789Sahrens 11331775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1134789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1135789Sahrens 1136789Sahrens zio_gang_byteswap(zio); 1137789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1138789Sahrens 1139789Sahrens ASSERT(gsize == gbufsize); 1140789Sahrens 1141789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1142789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1143789Sahrens lsize = BP_GET_PSIZE(gbp); 1144789Sahrens 1145789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1146789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1147789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1148789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1149789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1150789Sahrens 1151789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1152789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 11531544Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 11541544Seschrock &zio->io_bookmark)); 1155789Sahrens } 1156789Sahrens 1157789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1158789Sahrens zio_wait_children_ready(zio); 1159789Sahrens } 1160789Sahrens 1161789Sahrens static void 1162789Sahrens zio_free_gang_members(zio_t *zio) 1163789Sahrens { 1164789Sahrens zio_gbh_phys_t *gbh; 1165789Sahrens uint64_t gsize, gbufsize; 1166789Sahrens int i; 1167789Sahrens 11681775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1169789Sahrens 1170789Sahrens zio_gang_byteswap(zio); 1171789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1172789Sahrens 1173789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1174789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1175789Sahrens 1176789Sahrens if (BP_IS_HOLE(gbp)) 1177789Sahrens continue; 1178789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1179789Sahrens gbp, NULL, NULL)); 1180789Sahrens } 1181789Sahrens 1182789Sahrens zio_buf_free(gbh, gbufsize); 1183789Sahrens zio_next_stage(zio); 1184789Sahrens } 1185789Sahrens 1186789Sahrens static void 1187789Sahrens zio_claim_gang_members(zio_t *zio) 1188789Sahrens { 1189789Sahrens zio_gbh_phys_t *gbh; 1190789Sahrens uint64_t gsize, gbufsize; 1191789Sahrens int i; 1192789Sahrens 11931775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1194789Sahrens 1195789Sahrens zio_gang_byteswap(zio); 1196789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1197789Sahrens 1198789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1199789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1200789Sahrens if (BP_IS_HOLE(gbp)) 1201789Sahrens continue; 1202789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1203789Sahrens gbp, NULL, NULL)); 1204789Sahrens } 1205789Sahrens 1206789Sahrens zio_buf_free(gbh, gbufsize); 1207789Sahrens zio_next_stage(zio); 1208789Sahrens } 1209789Sahrens 1210789Sahrens static void 1211789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1212789Sahrens { 1213789Sahrens zio_t *pio = zio->io_parent; 12141775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 12151775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1216789Sahrens uint64_t asize; 12171775Sbillm int d; 1218789Sahrens 12191775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 12201775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 12211775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 12221775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 12231775Sbillm 1224789Sahrens mutex_enter(&pio->io_lock); 12251775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 12261775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 12271775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 12281775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 12291775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 12301775Sbillm } 1231789Sahrens mutex_exit(&pio->io_lock); 1232789Sahrens } 1233789Sahrens 1234789Sahrens static void 1235789Sahrens zio_write_allocate_gang_members(zio_t *zio) 1236789Sahrens { 1237789Sahrens blkptr_t *bp = zio->io_bp; 12381775Sbillm dva_t *dva = bp->blk_dva; 12391775Sbillm spa_t *spa = zio->io_spa; 1240789Sahrens zio_gbh_phys_t *gbh; 12411775Sbillm uint64_t txg = zio->io_txg; 1242789Sahrens uint64_t resid = zio->io_size; 1243789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1244789Sahrens uint64_t gsize, loff, lsize; 1245789Sahrens uint32_t gbps_left; 12461775Sbillm int ndvas = zio->io_ndvas; 12471775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1248789Sahrens int error; 12491775Sbillm int i, d; 1250789Sahrens 1251789Sahrens gsize = SPA_GANGBLOCKSIZE; 1252789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1253789Sahrens 12543063Sperrin error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); 1255789Sahrens if (error == ENOSPC) 1256789Sahrens panic("can't allocate gang block header"); 1257789Sahrens ASSERT(error == 0); 1258789Sahrens 12591775Sbillm for (d = 0; d < gbh_ndvas; d++) 12601775Sbillm DVA_SET_GANG(&dva[d], 1); 1261789Sahrens 12621775Sbillm bp->blk_birth = txg; 1263789Sahrens 1264789Sahrens gbh = zio_buf_alloc(gsize); 1265789Sahrens bzero(gbh, gsize); 1266789Sahrens 12671775Sbillm /* We need to test multi-level gang blocks */ 12681775Sbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 12691775Sbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 12701775Sbillm 1271789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1272789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1273789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 12741775Sbillm dva = gbp->blk_dva; 1275789Sahrens 1276789Sahrens ASSERT(gbps_left != 0); 1277789Sahrens maxalloc = MIN(maxalloc, resid); 1278789Sahrens 1279789Sahrens while (resid <= maxalloc * gbps_left) { 12801775Sbillm error = metaslab_alloc(spa, maxalloc, gbp, ndvas, 12813063Sperrin txg, bp, B_FALSE); 1282789Sahrens if (error == 0) 1283789Sahrens break; 1284789Sahrens ASSERT3U(error, ==, ENOSPC); 1285789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1286789Sahrens panic("really out of space"); 1287789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1288789Sahrens } 1289789Sahrens 1290789Sahrens if (resid <= maxalloc * gbps_left) { 1291789Sahrens lsize = maxalloc; 1292789Sahrens BP_SET_LSIZE(gbp, lsize); 1293789Sahrens BP_SET_PSIZE(gbp, lsize); 1294789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 12951775Sbillm gbp->blk_birth = txg; 12961775Sbillm zio_nowait(zio_rewrite(zio, spa, 12971775Sbillm zio->io_checksum, txg, gbp, 1298789Sahrens (char *)zio->io_data + loff, lsize, 1299789Sahrens zio_write_allocate_gang_member_done, NULL, 13001544Seschrock zio->io_priority, zio->io_flags, 13011544Seschrock &zio->io_bookmark)); 1302789Sahrens } else { 1303789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1304789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 13051775Sbillm zio_nowait(zio_write_allocate(zio, spa, 13061775Sbillm zio->io_checksum, txg, gbp, 1307789Sahrens (char *)zio->io_data + loff, lsize, 1308789Sahrens zio_write_allocate_gang_member_done, NULL, 1309789Sahrens zio->io_priority, zio->io_flags)); 1310789Sahrens } 1311789Sahrens } 1312789Sahrens 1313789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1314789Sahrens 1315789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1316789Sahrens 1317789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 13181775Sbillm /* 13191775Sbillm * As much as we'd like this to be zio_wait_children_ready(), 13201775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 13211775Sbillm * so we have to wait for that to finish in order for our BP 13221775Sbillm * to be stable. 13231775Sbillm */ 1324789Sahrens zio_wait_children_done(zio); 1325789Sahrens } 1326789Sahrens 1327789Sahrens /* 1328789Sahrens * ========================================================================== 1329789Sahrens * Allocate and free blocks 1330789Sahrens * ========================================================================== 1331789Sahrens */ 1332789Sahrens static void 1333789Sahrens zio_dva_allocate(zio_t *zio) 1334789Sahrens { 1335789Sahrens blkptr_t *bp = zio->io_bp; 1336789Sahrens int error; 1337789Sahrens 1338789Sahrens ASSERT(BP_IS_HOLE(bp)); 13391775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 13401775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 13411775Sbillm ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); 1342789Sahrens 1343789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1344789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 1345789Sahrens zio_write_allocate_gang_members(zio); 1346789Sahrens return; 1347789Sahrens } 1348789Sahrens 1349789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1350789Sahrens 13511775Sbillm error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, 13523063Sperrin zio->io_txg, NULL, B_FALSE); 1353789Sahrens 1354789Sahrens if (error == 0) { 1355789Sahrens bp->blk_birth = zio->io_txg; 1356789Sahrens } else if (error == ENOSPC) { 1357789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1358789Sahrens panic("really, truly out of space"); 1359789Sahrens zio_write_allocate_gang_members(zio); 1360789Sahrens return; 1361789Sahrens } else { 1362789Sahrens zio->io_error = error; 1363789Sahrens } 1364789Sahrens zio_next_stage(zio); 1365789Sahrens } 1366789Sahrens 1367789Sahrens static void 1368789Sahrens zio_dva_free(zio_t *zio) 1369789Sahrens { 1370789Sahrens blkptr_t *bp = zio->io_bp; 1371789Sahrens 13721807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1373789Sahrens 1374789Sahrens BP_ZERO(bp); 1375789Sahrens 1376789Sahrens zio_next_stage(zio); 1377789Sahrens } 1378789Sahrens 1379789Sahrens static void 1380789Sahrens zio_dva_claim(zio_t *zio) 1381789Sahrens { 13821807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1383789Sahrens 1384789Sahrens zio_next_stage(zio); 1385789Sahrens } 1386789Sahrens 1387789Sahrens /* 1388789Sahrens * ========================================================================== 1389789Sahrens * Read and write to physical devices 1390789Sahrens * ========================================================================== 1391789Sahrens */ 1392789Sahrens 1393789Sahrens static void 13941775Sbillm zio_vdev_io_start(zio_t *zio) 1395789Sahrens { 1396789Sahrens vdev_t *vd = zio->io_vd; 13971775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 13981775Sbillm blkptr_t *bp = zio->io_bp; 13991775Sbillm uint64_t align; 1400789Sahrens 14011775Sbillm if (vd == NULL) { 14021775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14031775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 14041775Sbillm return; 14051775Sbillm } 14061775Sbillm 14071775Sbillm align = 1ULL << tvd->vdev_ashift; 14081775Sbillm 14091732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1410789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1411789Sahrens 14121775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 14131775Sbillm vd->vdev_children == 0) { 1414789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1415789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1416789Sahrens } 1417789Sahrens 14181732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 14191732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 14201732Sbonwick char *abuf = zio_buf_alloc(asize); 14211732Sbonwick ASSERT(vd == tvd); 14221732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 14231732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 14241732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 14251732Sbonwick } 14261732Sbonwick zio_push_transform(zio, abuf, asize, asize); 14271732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 14281732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 14291732Sbonwick } 14301732Sbonwick 14311732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 14321732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 14331732Sbonwick ASSERT(bp == NULL || 14341732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1435789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1436789Sahrens 1437789Sahrens vdev_io_start(zio); 1438789Sahrens 1439789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1440789Sahrens } 1441789Sahrens 1442789Sahrens static void 1443789Sahrens zio_vdev_io_done(zio_t *zio) 1444789Sahrens { 14451775Sbillm if (zio->io_vd == NULL) 14461775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14471775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 14481775Sbillm else 14491775Sbillm vdev_io_done(zio); 1450789Sahrens } 1451789Sahrens 1452789Sahrens /* XXPOLICY */ 14531544Seschrock boolean_t 1454789Sahrens zio_should_retry(zio_t *zio) 1455789Sahrens { 1456789Sahrens vdev_t *vd = zio->io_vd; 1457789Sahrens 1458789Sahrens if (zio->io_error == 0) 1459789Sahrens return (B_FALSE); 1460789Sahrens if (zio->io_delegate_list != NULL) 1461789Sahrens return (B_FALSE); 14621775Sbillm if (vd && vd != vd->vdev_top) 1463789Sahrens return (B_FALSE); 1464789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1465789Sahrens return (B_FALSE); 14661544Seschrock if (zio->io_retries > 0) 1467789Sahrens return (B_FALSE); 1468789Sahrens 1469789Sahrens return (B_TRUE); 1470789Sahrens } 1471789Sahrens 1472789Sahrens static void 1473789Sahrens zio_vdev_io_assess(zio_t *zio) 1474789Sahrens { 1475789Sahrens vdev_t *vd = zio->io_vd; 14761775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1477789Sahrens 14781544Seschrock ASSERT(zio->io_vsd == NULL); 1479789Sahrens 14801732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 14811732Sbonwick void *abuf; 14821732Sbonwick uint64_t asize; 14831732Sbonwick ASSERT(vd == tvd); 14841732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 14851732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 14861732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 14871732Sbonwick zio_buf_free(abuf, asize); 14881732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 14891732Sbonwick } 14901732Sbonwick 14911544Seschrock if (zio_injection_enabled && !zio->io_error) 14921544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1493789Sahrens 1494789Sahrens /* 1495789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1496789Sahrens */ 1497789Sahrens /* XXPOLICY */ 1498789Sahrens if (zio_should_retry(zio)) { 1499789Sahrens ASSERT(tvd == vd); 1500789Sahrens 1501789Sahrens zio->io_retries++; 1502789Sahrens zio->io_error = 0; 15033463Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | 15043463Sahrens ZIO_FLAG_CONFIG_GRABBED; 1505789Sahrens /* XXPOLICY */ 1506789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1507789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 15081775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1509789Sahrens 1510789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1511789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1512789Sahrens vdev_description(vd), zio->io_offset); 1513789Sahrens 15141544Seschrock zio_next_stage_async(zio); 15151544Seschrock return; 15161544Seschrock } 1517789Sahrens 15181775Sbillm if (zio->io_error != 0 && zio->io_error != ECKSUM && 15191775Sbillm !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { 1520789Sahrens /* 15211544Seschrock * Poor man's hotplug support. Even if we're done retrying this 15221544Seschrock * I/O, try to reopen the vdev to see if it's still attached. 15231544Seschrock * To avoid excessive thrashing, we only try it once a minute. 15241544Seschrock * This also has the effect of detecting when missing devices 15251544Seschrock * have come back, by polling the device once a minute. 15261544Seschrock * 15271544Seschrock * We need to do this asynchronously because we can't grab 15281544Seschrock * all the necessary locks way down here. 1529789Sahrens */ 15301544Seschrock if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { 15311544Seschrock vd->vdev_last_try = gethrtime(); 15321544Seschrock tvd->vdev_reopen_wanted = 1; 15331544Seschrock spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); 15341544Seschrock } 1535789Sahrens } 1536789Sahrens 1537789Sahrens zio_next_stage(zio); 1538789Sahrens } 1539789Sahrens 1540789Sahrens void 1541789Sahrens zio_vdev_io_reissue(zio_t *zio) 1542789Sahrens { 1543789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1544789Sahrens ASSERT(zio->io_error == 0); 1545789Sahrens 1546789Sahrens zio->io_stage--; 1547789Sahrens } 1548789Sahrens 1549789Sahrens void 1550789Sahrens zio_vdev_io_redone(zio_t *zio) 1551789Sahrens { 1552789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1553789Sahrens 1554789Sahrens zio->io_stage--; 1555789Sahrens } 1556789Sahrens 1557789Sahrens void 1558789Sahrens zio_vdev_io_bypass(zio_t *zio) 1559789Sahrens { 1560789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1561789Sahrens ASSERT(zio->io_error == 0); 1562789Sahrens 1563789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1564789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1565789Sahrens } 1566789Sahrens 1567789Sahrens /* 1568789Sahrens * ========================================================================== 1569789Sahrens * Generate and verify checksums 1570789Sahrens * ========================================================================== 1571789Sahrens */ 1572789Sahrens static void 1573789Sahrens zio_checksum_generate(zio_t *zio) 1574789Sahrens { 1575789Sahrens int checksum = zio->io_checksum; 1576789Sahrens blkptr_t *bp = zio->io_bp; 1577789Sahrens 1578789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1579789Sahrens 1580789Sahrens BP_SET_CHECKSUM(bp, checksum); 1581789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1582789Sahrens 1583789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1584789Sahrens 1585789Sahrens zio_next_stage(zio); 1586789Sahrens } 1587789Sahrens 1588789Sahrens static void 1589789Sahrens zio_gang_checksum_generate(zio_t *zio) 1590789Sahrens { 1591789Sahrens zio_cksum_t zc; 1592789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1593789Sahrens 15941775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1595789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1596789Sahrens 1597789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1598789Sahrens 1599789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1600789Sahrens 1601789Sahrens zio_next_stage(zio); 1602789Sahrens } 1603789Sahrens 1604789Sahrens static void 1605789Sahrens zio_checksum_verify(zio_t *zio) 1606789Sahrens { 1607789Sahrens if (zio->io_bp != NULL) { 1608789Sahrens zio->io_error = zio_checksum_error(zio); 16091544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 16101544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 16111544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1612789Sahrens } 1613789Sahrens 1614789Sahrens zio_next_stage(zio); 1615789Sahrens } 1616789Sahrens 1617789Sahrens /* 1618789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1619789Sahrens */ 1620789Sahrens void 1621789Sahrens zio_checksum_verified(zio_t *zio) 1622789Sahrens { 1623789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1624789Sahrens } 1625789Sahrens 1626789Sahrens /* 1627789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1628789Sahrens */ 1629789Sahrens void 1630789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1631789Sahrens { 16321775Sbillm blkptr_t *bp = zio->io_bp; 16331775Sbillm 16341775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 16351775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 16361775Sbillm zcp->zc_word[2] = bp->blk_birth; 1637789Sahrens zcp->zc_word[3] = 0; 1638789Sahrens } 1639789Sahrens 1640789Sahrens /* 1641789Sahrens * ========================================================================== 1642789Sahrens * Define the pipeline 1643789Sahrens * ========================================================================== 1644789Sahrens */ 1645789Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1646789Sahrens 1647789Sahrens static void 1648789Sahrens zio_badop(zio_t *zio) 1649789Sahrens { 1650789Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1651789Sahrens } 1652789Sahrens 1653789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1654789Sahrens zio_badop, 1655789Sahrens zio_wait_children_ready, 1656789Sahrens zio_write_compress, 1657789Sahrens zio_checksum_generate, 1658789Sahrens zio_gang_pipeline, 1659789Sahrens zio_get_gang_header, 1660789Sahrens zio_rewrite_gang_members, 1661789Sahrens zio_free_gang_members, 1662789Sahrens zio_claim_gang_members, 1663789Sahrens zio_dva_allocate, 1664789Sahrens zio_dva_free, 1665789Sahrens zio_dva_claim, 1666789Sahrens zio_gang_checksum_generate, 1667789Sahrens zio_ready, 1668789Sahrens zio_vdev_io_start, 1669789Sahrens zio_vdev_io_done, 1670789Sahrens zio_vdev_io_assess, 1671789Sahrens zio_wait_children_done, 1672789Sahrens zio_checksum_verify, 1673789Sahrens zio_read_gang_members, 1674789Sahrens zio_read_decompress, 1675789Sahrens zio_done, 1676789Sahrens zio_badop 1677789Sahrens }; 1678789Sahrens 1679789Sahrens /* 1680789Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1681789Sahrens * There's no locking on io_stage because there's no legitimate way for 1682789Sahrens * multiple threads to be attempting to process the same I/O. 1683789Sahrens */ 1684789Sahrens void 1685789Sahrens zio_next_stage(zio_t *zio) 1686789Sahrens { 1687789Sahrens uint32_t pipeline = zio->io_pipeline; 1688789Sahrens 1689789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1690789Sahrens 1691789Sahrens if (zio->io_error) { 1692789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1693789Sahrens zio, vdev_description(zio->io_vd), 1694789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1695789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1696789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1697789Sahrens } 1698789Sahrens 1699789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1700789Sahrens continue; 1701789Sahrens 1702789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1703789Sahrens ASSERT(zio->io_stalled == 0); 1704789Sahrens 1705789Sahrens zio_pipeline[zio->io_stage](zio); 1706789Sahrens } 1707789Sahrens 1708789Sahrens void 1709789Sahrens zio_next_stage_async(zio_t *zio) 1710789Sahrens { 1711789Sahrens taskq_t *tq; 1712789Sahrens uint32_t pipeline = zio->io_pipeline; 1713789Sahrens 1714789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1715789Sahrens 1716789Sahrens if (zio->io_error) { 1717789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1718789Sahrens zio, vdev_description(zio->io_vd), 1719789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1720789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1721789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1722789Sahrens } 1723789Sahrens 1724789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1725789Sahrens continue; 1726789Sahrens 1727789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1728789Sahrens ASSERT(zio->io_stalled == 0); 1729789Sahrens 1730789Sahrens /* 1731789Sahrens * For performance, we'll probably want two sets of task queues: 1732789Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1733789Sahrens * part is for read performance: since we have to make a pass over 1734789Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1735789Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1736789Sahrens * that thread is probably still there. Getting this optimization 1737789Sahrens * right avoids performance-hostile cache-to-cache transfers. 1738789Sahrens * 1739789Sahrens * Note that having two sets of task queues is also necessary for 1740789Sahrens * correctness: if all of the issue threads get bogged down waiting 1741789Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1742789Sahrens * there won't be any threads available to service I/O completion 1743789Sahrens * interrupts. 1744789Sahrens */ 1745789Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1746789Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1747789Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1748789Sahrens else 1749789Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1750789Sahrens (void) taskq_dispatch(tq, 1751789Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1752789Sahrens } else { 1753789Sahrens zio_pipeline[zio->io_stage](zio); 1754789Sahrens } 1755789Sahrens } 1756789Sahrens 1757*3668Sgw25295 static boolean_t 1758*3668Sgw25295 zio_alloc_should_fail(void) 1759*3668Sgw25295 { 1760*3668Sgw25295 static uint16_t allocs = 0; 1761*3668Sgw25295 1762*3668Sgw25295 return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); 1763*3668Sgw25295 } 1764*3668Sgw25295 1765789Sahrens /* 1766789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1767789Sahrens */ 1768789Sahrens int 17693063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 17703063Sperrin uint64_t txg) 1771789Sahrens { 1772789Sahrens int error; 1773789Sahrens 17741544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1775789Sahrens 1776*3668Sgw25295 if (zio_zil_fail_shift && zio_alloc_should_fail()) { 1777*3668Sgw25295 spa_config_exit(spa, FTAG); 1778*3668Sgw25295 return (ENOSPC); 1779*3668Sgw25295 } 1780*3668Sgw25295 17813063Sperrin /* 17823063Sperrin * We were passed the previous log blocks dva_t in bp->blk_dva[0]. 17833063Sperrin */ 17843063Sperrin error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); 1785789Sahrens 1786789Sahrens if (error == 0) { 17873063Sperrin BP_SET_LSIZE(new_bp, size); 17883063Sperrin BP_SET_PSIZE(new_bp, size); 17893063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 17903063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 17913063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 17923063Sperrin BP_SET_LEVEL(new_bp, 0); 17933063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 17943063Sperrin new_bp->blk_birth = txg; 1795789Sahrens } 1796789Sahrens 17971544Seschrock spa_config_exit(spa, FTAG); 1798789Sahrens 1799789Sahrens return (error); 1800789Sahrens } 1801789Sahrens 1802789Sahrens /* 1803789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 1804789Sahrens * nothing to do except metaslab_free() it. 1805789Sahrens */ 1806789Sahrens void 1807789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1808789Sahrens { 18091775Sbillm ASSERT(!BP_IS_GANG(bp)); 1810789Sahrens 18111544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1812789Sahrens 18131807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 1814789Sahrens 18151544Seschrock spa_config_exit(spa, FTAG); 1816789Sahrens } 1817