1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 223459Sek110237 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 673668Sgw25295 /* Force an allocation failure when non-zero */ 683668Sgw25295 uint16_t zio_zil_fail_shift = 0; 693668Sgw25295 70789Sahrens typedef struct zio_sync_pass { 71789Sahrens int zp_defer_free; /* defer frees after this pass */ 72789Sahrens int zp_dontcompress; /* don't compress after this pass */ 73789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 74789Sahrens } zio_sync_pass_t; 75789Sahrens 76789Sahrens zio_sync_pass_t zio_sync_pass = { 77789Sahrens 1, /* zp_defer_free */ 78789Sahrens 4, /* zp_dontcompress */ 79789Sahrens 1, /* zp_rewrite */ 80789Sahrens }; 81789Sahrens 82789Sahrens /* 83789Sahrens * ========================================================================== 84789Sahrens * I/O kmem caches 85789Sahrens * ========================================================================== 86789Sahrens */ 87789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 883290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 893290Sjohansen 903290Sjohansen #ifdef _KERNEL 913290Sjohansen extern vmem_t *zio_alloc_arena; 923290Sjohansen #endif 93789Sahrens 94789Sahrens void 95789Sahrens zio_init(void) 96789Sahrens { 97789Sahrens size_t c; 983290Sjohansen vmem_t *data_alloc_arena = NULL; 993290Sjohansen 1003290Sjohansen #ifdef _KERNEL 1013290Sjohansen data_alloc_arena = zio_alloc_arena; 1023290Sjohansen #endif 103789Sahrens 104789Sahrens /* 105789Sahrens * For small buffers, we want a cache for each multiple of 106789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 107789Sahrens * for each quarter-power of 2. For large buffers, we want 108789Sahrens * a cache for each multiple of PAGESIZE. 109789Sahrens */ 110789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 111789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 112789Sahrens size_t p2 = size; 113789Sahrens size_t align = 0; 114789Sahrens 115789Sahrens while (p2 & (p2 - 1)) 116789Sahrens p2 &= p2 - 1; 117789Sahrens 118789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 119789Sahrens align = SPA_MINBLOCKSIZE; 120789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 121789Sahrens align = PAGESIZE; 122789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 123789Sahrens align = p2 >> 2; 124789Sahrens } 125789Sahrens 126789Sahrens if (align != 0) { 1273290Sjohansen char name[36]; 1282856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 129789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 130849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1313290Sjohansen 1323290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1333290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1343290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1353290Sjohansen KMC_NODEBUG); 1363290Sjohansen 137789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 138789Sahrens size, align); 139789Sahrens } 140789Sahrens } 141789Sahrens 142789Sahrens while (--c != 0) { 143789Sahrens ASSERT(zio_buf_cache[c] != NULL); 144789Sahrens if (zio_buf_cache[c - 1] == NULL) 145789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1463290Sjohansen 1473290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1483290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1493290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 150789Sahrens } 1511544Seschrock 1521544Seschrock zio_inject_init(); 153789Sahrens } 154789Sahrens 155789Sahrens void 156789Sahrens zio_fini(void) 157789Sahrens { 158789Sahrens size_t c; 159789Sahrens kmem_cache_t *last_cache = NULL; 1603290Sjohansen kmem_cache_t *last_data_cache = NULL; 161789Sahrens 162789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 163789Sahrens if (zio_buf_cache[c] != last_cache) { 164789Sahrens last_cache = zio_buf_cache[c]; 165789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 166789Sahrens } 167789Sahrens zio_buf_cache[c] = NULL; 1683290Sjohansen 1693290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1703290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1713290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1723290Sjohansen } 1733290Sjohansen zio_data_buf_cache[c] = NULL; 174789Sahrens } 1751544Seschrock 1761544Seschrock zio_inject_fini(); 177789Sahrens } 178789Sahrens 179789Sahrens /* 180789Sahrens * ========================================================================== 181789Sahrens * Allocate and free I/O buffers 182789Sahrens * ========================================================================== 183789Sahrens */ 1843290Sjohansen 1853290Sjohansen /* 1863290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1873290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1883290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1893290Sjohansen * excess / transient data in-core during a crashdump. 1903290Sjohansen */ 191789Sahrens void * 192789Sahrens zio_buf_alloc(size_t size) 193789Sahrens { 194789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 195789Sahrens 196789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 197789Sahrens 198789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 199789Sahrens } 200789Sahrens 2013290Sjohansen /* 2023290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2033290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2043290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2053290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2063290Sjohansen */ 2073290Sjohansen void * 2083290Sjohansen zio_data_buf_alloc(size_t size) 2093290Sjohansen { 2103290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2113290Sjohansen 2123290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2133290Sjohansen 2143290Sjohansen return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 2153290Sjohansen } 2163290Sjohansen 217789Sahrens void 218789Sahrens zio_buf_free(void *buf, size_t size) 219789Sahrens { 220789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 221789Sahrens 222789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 223789Sahrens 224789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 225789Sahrens } 226789Sahrens 2273290Sjohansen void 2283290Sjohansen zio_data_buf_free(void *buf, size_t size) 2293290Sjohansen { 2303290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2313290Sjohansen 2323290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2333290Sjohansen 2343290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2353290Sjohansen } 2363463Sahrens 237789Sahrens /* 238789Sahrens * ========================================================================== 239789Sahrens * Push and pop I/O transform buffers 240789Sahrens * ========================================================================== 241789Sahrens */ 242789Sahrens static void 243789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 244789Sahrens { 245789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 246789Sahrens 247789Sahrens zt->zt_data = data; 248789Sahrens zt->zt_size = size; 249789Sahrens zt->zt_bufsize = bufsize; 250789Sahrens 251789Sahrens zt->zt_next = zio->io_transform_stack; 252789Sahrens zio->io_transform_stack = zt; 253789Sahrens 254789Sahrens zio->io_data = data; 255789Sahrens zio->io_size = size; 256789Sahrens } 257789Sahrens 258789Sahrens static void 259789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 260789Sahrens { 261789Sahrens zio_transform_t *zt = zio->io_transform_stack; 262789Sahrens 263789Sahrens *data = zt->zt_data; 264789Sahrens *size = zt->zt_size; 265789Sahrens *bufsize = zt->zt_bufsize; 266789Sahrens 267789Sahrens zio->io_transform_stack = zt->zt_next; 268789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 269789Sahrens 270789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 271789Sahrens zio->io_data = zt->zt_data; 272789Sahrens zio->io_size = zt->zt_size; 273789Sahrens } 274789Sahrens } 275789Sahrens 276789Sahrens static void 277789Sahrens zio_clear_transform_stack(zio_t *zio) 278789Sahrens { 279789Sahrens void *data; 280789Sahrens uint64_t size, bufsize; 281789Sahrens 282789Sahrens ASSERT(zio->io_transform_stack != NULL); 283789Sahrens 284789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 285789Sahrens while (zio->io_transform_stack != NULL) { 286789Sahrens zio_buf_free(data, bufsize); 287789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 288789Sahrens } 289789Sahrens } 290789Sahrens 291789Sahrens /* 292789Sahrens * ========================================================================== 293789Sahrens * Create the various types of I/O (read, write, free) 294789Sahrens * ========================================================================== 295789Sahrens */ 296789Sahrens static zio_t * 297789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 298789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 299789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 300789Sahrens { 301789Sahrens zio_t *zio; 302789Sahrens 303789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 304789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 305789Sahrens 306789Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 307789Sahrens zio->io_parent = pio; 308789Sahrens zio->io_spa = spa; 309789Sahrens zio->io_txg = txg; 310789Sahrens if (bp != NULL) { 311789Sahrens zio->io_bp = bp; 312789Sahrens zio->io_bp_copy = *bp; 313789Sahrens zio->io_bp_orig = *bp; 314789Sahrens } 315789Sahrens zio->io_done = done; 316789Sahrens zio->io_private = private; 317789Sahrens zio->io_type = type; 318789Sahrens zio->io_priority = priority; 319789Sahrens zio->io_stage = stage; 320789Sahrens zio->io_pipeline = pipeline; 321789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 322789Sahrens zio->io_timestamp = lbolt64; 323789Sahrens zio->io_flags = flags; 3242856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 325789Sahrens zio_push_transform(zio, data, size, size); 326789Sahrens 3273463Sahrens /* 3283463Sahrens * Note on config lock: 3293463Sahrens * 3303463Sahrens * If CONFIG_HELD is set, then the caller already has the config 3313463Sahrens * lock, so we don't need it for this io. 3323463Sahrens * 3333463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 3343463Sahrens * config lock on behalf of this io, so it should be released 3353463Sahrens * in zio_done. 3363463Sahrens * 3373463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 3383463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 3393463Sahrens * The NULL top-level ios rarely have any children, so we delay 3403463Sahrens * grabbing the lock until the first child is added (but it is 3413463Sahrens * still grabbed on behalf of the top-level i/o, so additional 3423463Sahrens * children don't need to also grab it). This greatly reduces 3433463Sahrens * contention on the config lock. 3443463Sahrens */ 345789Sahrens if (pio == NULL) { 3463463Sahrens if (type != ZIO_TYPE_NULL && 3473463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3481544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 3493463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3503463Sahrens } 351789Sahrens zio->io_root = zio; 352789Sahrens } else { 353789Sahrens zio->io_root = pio->io_root; 3541544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3551544Seschrock zio->io_logical = pio->io_logical; 356789Sahrens mutex_enter(&pio->io_lock); 3573463Sahrens if (pio->io_parent == NULL && 3583463Sahrens pio->io_type == ZIO_TYPE_NULL && 3593463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 3603463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 3613463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3623463Sahrens spa_config_enter(zio->io_spa, RW_READER, pio); 3633463Sahrens } 364789Sahrens if (stage < ZIO_STAGE_READY) 365789Sahrens pio->io_children_notready++; 366789Sahrens pio->io_children_notdone++; 367789Sahrens zio->io_sibling_next = pio->io_child; 368789Sahrens zio->io_sibling_prev = NULL; 369789Sahrens if (pio->io_child != NULL) 370789Sahrens pio->io_child->io_sibling_prev = zio; 371789Sahrens pio->io_child = zio; 3721775Sbillm zio->io_ndvas = pio->io_ndvas; 373789Sahrens mutex_exit(&pio->io_lock); 374789Sahrens } 375789Sahrens 376789Sahrens return (zio); 377789Sahrens } 378789Sahrens 379789Sahrens zio_t * 380789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 381789Sahrens int flags) 382789Sahrens { 383789Sahrens zio_t *zio; 384789Sahrens 385789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 386789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 387789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 388789Sahrens 389789Sahrens return (zio); 390789Sahrens } 391789Sahrens 392789Sahrens zio_t * 393789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 394789Sahrens { 395789Sahrens return (zio_null(NULL, spa, done, private, flags)); 396789Sahrens } 397789Sahrens 398789Sahrens zio_t * 399789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 400789Sahrens uint64_t size, zio_done_func_t *done, void *private, 4011544Seschrock int priority, int flags, zbookmark_t *zb) 402789Sahrens { 403789Sahrens zio_t *zio; 404789Sahrens 405789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 406789Sahrens 407789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 4082981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4092981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4101544Seschrock zio->io_bookmark = *zb; 4111544Seschrock 4121544Seschrock zio->io_logical = zio; 413789Sahrens 414789Sahrens /* 415789Sahrens * Work off our copy of the bp so the caller can free it. 416789Sahrens */ 417789Sahrens zio->io_bp = &zio->io_bp_copy; 418789Sahrens 419789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 420789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 421789Sahrens void *cbuf = zio_buf_alloc(csize); 422789Sahrens 423789Sahrens zio_push_transform(zio, cbuf, csize, csize); 424789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 425789Sahrens } 426789Sahrens 4271775Sbillm if (BP_IS_GANG(bp)) { 428789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 429789Sahrens void *gbuf = zio_buf_alloc(gsize); 430789Sahrens 431789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 432789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 433789Sahrens } 434789Sahrens 435789Sahrens return (zio); 436789Sahrens } 437789Sahrens 438789Sahrens zio_t * 4391775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 440789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4413547Smaybee zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 4423547Smaybee int flags, zbookmark_t *zb) 443789Sahrens { 444789Sahrens zio_t *zio; 445789Sahrens 446789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 447789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 448789Sahrens 449789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 450789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 451789Sahrens 452789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4532981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 454789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 455789Sahrens 4563547Smaybee zio->io_ready = ready; 4573547Smaybee 4581544Seschrock zio->io_bookmark = *zb; 4591544Seschrock 4601544Seschrock zio->io_logical = zio; 4611544Seschrock 462789Sahrens zio->io_checksum = checksum; 463789Sahrens zio->io_compress = compress; 4641775Sbillm zio->io_ndvas = ncopies; 465789Sahrens 466789Sahrens if (compress != ZIO_COMPRESS_OFF) 467789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 468789Sahrens 469789Sahrens if (bp->blk_birth != txg) { 470789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 471789Sahrens BP_ZERO(bp); 472789Sahrens BP_SET_LSIZE(bp, size); 473789Sahrens BP_SET_PSIZE(bp, size); 4741775Sbillm } else { 4751775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 4761775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 4771775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 478789Sahrens } 479789Sahrens 480789Sahrens return (zio); 481789Sahrens } 482789Sahrens 483789Sahrens zio_t * 484789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 485789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4861544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4871544Seschrock zbookmark_t *zb) 488789Sahrens { 489789Sahrens zio_t *zio; 490789Sahrens 491789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4922981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 493789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 494789Sahrens 4951544Seschrock zio->io_bookmark = *zb; 496789Sahrens zio->io_checksum = checksum; 497789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 498789Sahrens 4991775Sbillm if (pio != NULL) 5001775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 5011775Sbillm 502789Sahrens return (zio); 503789Sahrens } 504789Sahrens 505789Sahrens static zio_t * 506789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 507789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 508789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 509789Sahrens { 510789Sahrens zio_t *zio; 511789Sahrens 512789Sahrens BP_ZERO(bp); 513789Sahrens BP_SET_LSIZE(bp, size); 514789Sahrens BP_SET_PSIZE(bp, size); 515789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 516789Sahrens 517789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 518789Sahrens ZIO_TYPE_WRITE, priority, flags, 519789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 520789Sahrens 521789Sahrens zio->io_checksum = checksum; 522789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 523789Sahrens 524789Sahrens return (zio); 525789Sahrens } 526789Sahrens 527789Sahrens zio_t * 528789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 529789Sahrens zio_done_func_t *done, void *private) 530789Sahrens { 531789Sahrens zio_t *zio; 532789Sahrens 533789Sahrens ASSERT(!BP_IS_HOLE(bp)); 534789Sahrens 535789Sahrens if (txg == spa->spa_syncing_txg && 536789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 537789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 538789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 539789Sahrens } 540789Sahrens 541789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 5422981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 543789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 544789Sahrens 545789Sahrens zio->io_bp = &zio->io_bp_copy; 546789Sahrens 547789Sahrens return (zio); 548789Sahrens } 549789Sahrens 550789Sahrens zio_t * 551789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 552789Sahrens zio_done_func_t *done, void *private) 553789Sahrens { 554789Sahrens zio_t *zio; 555789Sahrens 556789Sahrens /* 557789Sahrens * A claim is an allocation of a specific block. Claims are needed 558789Sahrens * to support immediate writes in the intent log. The issue is that 559789Sahrens * immediate writes contain committed data, but in a txg that was 560789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 561789Sahrens * the intent log claims all blocks that contain immediate write data 562789Sahrens * so that the SPA knows they're in use. 563789Sahrens * 564789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 565789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 566789Sahrens */ 567789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 568789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 569789Sahrens 570789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 571789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 572789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 573789Sahrens 574789Sahrens zio->io_bp = &zio->io_bp_copy; 575789Sahrens 576789Sahrens return (zio); 577789Sahrens } 578789Sahrens 579789Sahrens zio_t * 580789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 581789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 582789Sahrens { 583789Sahrens zio_t *zio; 584789Sahrens int c; 585789Sahrens 586789Sahrens if (vd->vdev_children == 0) { 587789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 588789Sahrens ZIO_TYPE_IOCTL, priority, flags, 589789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 590789Sahrens 591789Sahrens zio->io_vd = vd; 592789Sahrens zio->io_cmd = cmd; 593789Sahrens } else { 594789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 595789Sahrens 596789Sahrens for (c = 0; c < vd->vdev_children; c++) 597789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 598789Sahrens done, private, priority, flags)); 599789Sahrens } 600789Sahrens 601789Sahrens return (zio); 602789Sahrens } 603789Sahrens 604789Sahrens static void 605789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 606789Sahrens int checksum) 607789Sahrens { 608789Sahrens ASSERT(vd->vdev_children == 0); 609789Sahrens 610789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 611789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 612789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 613789Sahrens 614789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 615789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 616789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 617789Sahrens 618789Sahrens BP_ZERO(bp); 619789Sahrens 620789Sahrens BP_SET_LSIZE(bp, size); 621789Sahrens BP_SET_PSIZE(bp, size); 622789Sahrens 623789Sahrens BP_SET_CHECKSUM(bp, checksum); 624789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 625789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 626789Sahrens 627789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 628789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 629789Sahrens } 630789Sahrens 631789Sahrens zio_t * 632789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 633789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 634789Sahrens int priority, int flags) 635789Sahrens { 636789Sahrens zio_t *zio; 637789Sahrens blkptr_t blk; 638789Sahrens 639789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 640789Sahrens 641789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 642789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 643789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 644789Sahrens 645789Sahrens zio->io_vd = vd; 646789Sahrens zio->io_offset = offset; 647789Sahrens 648789Sahrens /* 649789Sahrens * Work off our copy of the bp so the caller can free it. 650789Sahrens */ 651789Sahrens zio->io_bp = &zio->io_bp_copy; 652789Sahrens 653789Sahrens return (zio); 654789Sahrens } 655789Sahrens 656789Sahrens zio_t * 657789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 658789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 659789Sahrens int priority, int flags) 660789Sahrens { 661789Sahrens zio_block_tail_t *zbt; 662789Sahrens void *wbuf; 663789Sahrens zio_t *zio; 664789Sahrens blkptr_t blk; 665789Sahrens 666789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 667789Sahrens 668789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 669789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 670789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 671789Sahrens 672789Sahrens zio->io_vd = vd; 673789Sahrens zio->io_offset = offset; 674789Sahrens 675789Sahrens zio->io_bp = &zio->io_bp_copy; 676789Sahrens zio->io_checksum = checksum; 677789Sahrens 678789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 679789Sahrens /* 680789Sahrens * zbt checksums are necessarily destructive -- they modify 681789Sahrens * one word of the write buffer to hold the verifier/checksum. 682789Sahrens * Therefore, we must make a local copy in case the data is 683789Sahrens * being written to multiple places. 684789Sahrens */ 685789Sahrens wbuf = zio_buf_alloc(size); 686789Sahrens bcopy(data, wbuf, size); 687789Sahrens zio_push_transform(zio, wbuf, size, size); 688789Sahrens 689789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 690789Sahrens zbt->zbt_cksum = blk.blk_cksum; 691789Sahrens } 692789Sahrens 693789Sahrens return (zio); 694789Sahrens } 695789Sahrens 696789Sahrens /* 697789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 698789Sahrens */ 699789Sahrens zio_t * 700789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 701789Sahrens void *data, uint64_t size, int type, int priority, int flags, 702789Sahrens zio_done_func_t *done, void *private) 703789Sahrens { 704789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 705789Sahrens zio_t *cio; 706789Sahrens 707789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 708789Sahrens /* 709789Sahrens * If we have the bp, then the child should perform the 710789Sahrens * checksum and the parent need not. This pushes error 711789Sahrens * detection as close to the leaves as possible and 712789Sahrens * eliminates redundant checksums in the interior nodes. 713789Sahrens */ 714789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 715789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 716789Sahrens } 717789Sahrens 718789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 719789Sahrens done, private, type, priority, 720789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 7211775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 722789Sahrens 723789Sahrens cio->io_vd = vd; 724789Sahrens cio->io_offset = offset; 725789Sahrens 726789Sahrens return (cio); 727789Sahrens } 728789Sahrens 729789Sahrens /* 730789Sahrens * ========================================================================== 731789Sahrens * Initiate I/O, either sync or async 732789Sahrens * ========================================================================== 733789Sahrens */ 734789Sahrens int 735789Sahrens zio_wait(zio_t *zio) 736789Sahrens { 737789Sahrens int error; 738789Sahrens 739789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 740789Sahrens 741789Sahrens zio->io_waiter = curthread; 742789Sahrens 743789Sahrens zio_next_stage_async(zio); 744789Sahrens 745789Sahrens mutex_enter(&zio->io_lock); 746789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 747789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 748789Sahrens mutex_exit(&zio->io_lock); 749789Sahrens 750789Sahrens error = zio->io_error; 7512856Snd150628 mutex_destroy(&zio->io_lock); 752789Sahrens kmem_free(zio, sizeof (zio_t)); 753789Sahrens 754789Sahrens return (error); 755789Sahrens } 756789Sahrens 757789Sahrens void 758789Sahrens zio_nowait(zio_t *zio) 759789Sahrens { 760789Sahrens zio_next_stage_async(zio); 761789Sahrens } 762789Sahrens 763789Sahrens /* 764789Sahrens * ========================================================================== 765789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 766789Sahrens * ========================================================================== 767789Sahrens */ 768789Sahrens static void 769789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 770789Sahrens { 771789Sahrens mutex_enter(&zio->io_lock); 772789Sahrens if (*countp == 0) { 773789Sahrens ASSERT(zio->io_stalled == 0); 774789Sahrens mutex_exit(&zio->io_lock); 775789Sahrens zio_next_stage(zio); 776789Sahrens } else { 777789Sahrens zio->io_stalled = stage; 778789Sahrens mutex_exit(&zio->io_lock); 779789Sahrens } 780789Sahrens } 781789Sahrens 782789Sahrens static void 783789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 784789Sahrens { 785789Sahrens zio_t *pio = zio->io_parent; 786789Sahrens 787789Sahrens mutex_enter(&pio->io_lock); 788789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 789789Sahrens pio->io_error = zio->io_error; 790789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 791789Sahrens pio->io_stalled = 0; 792789Sahrens mutex_exit(&pio->io_lock); 793789Sahrens zio_next_stage_async(pio); 794789Sahrens } else { 795789Sahrens mutex_exit(&pio->io_lock); 796789Sahrens } 797789Sahrens } 798789Sahrens 799789Sahrens static void 800789Sahrens zio_wait_children_ready(zio_t *zio) 801789Sahrens { 802789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 803789Sahrens &zio->io_children_notready); 804789Sahrens } 805789Sahrens 806789Sahrens void 807789Sahrens zio_wait_children_done(zio_t *zio) 808789Sahrens { 809789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 810789Sahrens &zio->io_children_notdone); 811789Sahrens } 812789Sahrens 813789Sahrens static void 814789Sahrens zio_ready(zio_t *zio) 815789Sahrens { 816789Sahrens zio_t *pio = zio->io_parent; 817789Sahrens 8183547Smaybee if (zio->io_ready) 8193547Smaybee zio->io_ready(zio); 8203547Smaybee 821789Sahrens if (pio != NULL) 822789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 823789Sahrens &pio->io_children_notready); 824789Sahrens 825789Sahrens if (zio->io_bp) 826789Sahrens zio->io_bp_copy = *zio->io_bp; 827789Sahrens 828789Sahrens zio_next_stage(zio); 829789Sahrens } 830789Sahrens 831789Sahrens static void 832789Sahrens zio_done(zio_t *zio) 833789Sahrens { 834789Sahrens zio_t *pio = zio->io_parent; 835789Sahrens spa_t *spa = zio->io_spa; 836789Sahrens blkptr_t *bp = zio->io_bp; 837789Sahrens vdev_t *vd = zio->io_vd; 838789Sahrens 839789Sahrens ASSERT(zio->io_children_notready == 0); 840789Sahrens ASSERT(zio->io_children_notdone == 0); 841789Sahrens 842789Sahrens if (bp != NULL) { 843789Sahrens ASSERT(bp->blk_pad[0] == 0); 844789Sahrens ASSERT(bp->blk_pad[1] == 0); 845789Sahrens ASSERT(bp->blk_pad[2] == 0); 846789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 847789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 8481775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 849789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 8501775Sbillm if (zio->io_ndvas != 0) 8511775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 8521775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 8531775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 8541775Sbillm } 855789Sahrens } 856789Sahrens 857789Sahrens if (vd != NULL) 858789Sahrens vdev_stat_update(zio); 859789Sahrens 860789Sahrens if (zio->io_error) { 8611544Seschrock /* 8621544Seschrock * If this I/O is attached to a particular vdev, 8631544Seschrock * generate an error message describing the I/O failure 8641544Seschrock * at the block level. We ignore these errors if the 8651544Seschrock * device is currently unavailable. 8661544Seschrock */ 8671732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 8681544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 8691732Sbonwick zio->io_spa, vd, zio, 0, 0); 870789Sahrens 8711544Seschrock if ((zio->io_error == EIO || 8721544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 8731544Seschrock zio->io_logical == zio) { 8741544Seschrock /* 8751544Seschrock * For root I/O requests, tell the SPA to log the error 8761544Seschrock * appropriately. Also, generate a logical data 8771544Seschrock * ereport. 8781544Seschrock */ 8791544Seschrock spa_log_error(zio->io_spa, zio); 8801544Seschrock 8811544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 8821544Seschrock zio->io_spa, NULL, zio, 0, 0); 8831544Seschrock } 884789Sahrens 8851544Seschrock /* 8861544Seschrock * For I/O requests that cannot fail, panic appropriately. 8871544Seschrock */ 8881544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 8893459Sek110237 char *blkbuf; 8903459Sek110237 8913459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 8923459Sek110237 if (blkbuf) { 8933459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 8943459Sek110237 bp ? bp : &zio->io_bp_copy); 8953459Sek110237 } 8961544Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 8971544Seschrock "%d", zio->io_error == ECKSUM ? 8981544Seschrock "bad checksum" : "I/O failure", 8991544Seschrock zio_type_name[zio->io_type], 9001544Seschrock vdev_description(vd), 9011544Seschrock (u_longlong_t)zio->io_offset, 9023459Sek110237 zio, blkbuf ? blkbuf : "", zio->io_error); 9031544Seschrock } 904789Sahrens } 905789Sahrens zio_clear_transform_stack(zio); 906789Sahrens 907789Sahrens if (zio->io_done) 908789Sahrens zio->io_done(zio); 909789Sahrens 910789Sahrens ASSERT(zio->io_delegate_list == NULL); 911789Sahrens ASSERT(zio->io_delegate_next == NULL); 912789Sahrens 913789Sahrens if (pio != NULL) { 914789Sahrens zio_t *next, *prev; 915789Sahrens 916789Sahrens mutex_enter(&pio->io_lock); 917789Sahrens next = zio->io_sibling_next; 918789Sahrens prev = zio->io_sibling_prev; 919789Sahrens if (next != NULL) 920789Sahrens next->io_sibling_prev = prev; 921789Sahrens if (prev != NULL) 922789Sahrens prev->io_sibling_next = next; 923789Sahrens if (pio->io_child == zio) 924789Sahrens pio->io_child = next; 925789Sahrens mutex_exit(&pio->io_lock); 926789Sahrens 927789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 928789Sahrens &pio->io_children_notdone); 929789Sahrens } 930789Sahrens 9313463Sahrens /* 9323463Sahrens * Note: this I/O is now done, and will shortly be 9333463Sahrens * kmem_free()'d, so there is no need to clear this (or any 9343463Sahrens * other) flag. 9353463Sahrens */ 9363463Sahrens if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 9371544Seschrock spa_config_exit(spa, zio); 938789Sahrens 939789Sahrens if (zio->io_waiter != NULL) { 940789Sahrens mutex_enter(&zio->io_lock); 941789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 942789Sahrens zio->io_stalled = zio->io_stage; 943789Sahrens cv_broadcast(&zio->io_cv); 944789Sahrens mutex_exit(&zio->io_lock); 945789Sahrens } else { 946789Sahrens kmem_free(zio, sizeof (zio_t)); 947789Sahrens } 948789Sahrens } 949789Sahrens 950789Sahrens /* 951789Sahrens * ========================================================================== 952789Sahrens * Compression support 953789Sahrens * ========================================================================== 954789Sahrens */ 955789Sahrens static void 956789Sahrens zio_write_compress(zio_t *zio) 957789Sahrens { 958789Sahrens int compress = zio->io_compress; 959789Sahrens blkptr_t *bp = zio->io_bp; 960789Sahrens void *cbuf; 961789Sahrens uint64_t lsize = zio->io_size; 962789Sahrens uint64_t csize = lsize; 963789Sahrens uint64_t cbufsize = 0; 964789Sahrens int pass; 965789Sahrens 966789Sahrens if (bp->blk_birth == zio->io_txg) { 967789Sahrens /* 968789Sahrens * We're rewriting an existing block, which means we're 969789Sahrens * working on behalf of spa_sync(). For spa_sync() to 970789Sahrens * converge, it must eventually be the case that we don't 971789Sahrens * have to allocate new blocks. But compression changes 972789Sahrens * the blocksize, which forces a reallocate, and makes 973789Sahrens * convergence take longer. Therefore, after the first 974789Sahrens * few passes, stop compressing to ensure convergence. 975789Sahrens */ 976789Sahrens pass = spa_sync_pass(zio->io_spa); 977789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 978789Sahrens compress = ZIO_COMPRESS_OFF; 979789Sahrens } else { 980789Sahrens ASSERT(BP_IS_HOLE(bp)); 981789Sahrens pass = 1; 982789Sahrens } 983789Sahrens 984789Sahrens if (compress != ZIO_COMPRESS_OFF) 985789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 986789Sahrens &cbuf, &csize, &cbufsize)) 987789Sahrens compress = ZIO_COMPRESS_OFF; 988789Sahrens 989789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 990789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 991789Sahrens 992789Sahrens /* 993789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 994789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 995789Sahrens * but newly allocated blocks are sequential, so they can be written 996789Sahrens * to disk faster. Therefore, we allow the first few passes of 997789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 998789Sahrens * There should only be a handful of blocks after pass 1 in any case. 999789Sahrens */ 1000789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 1001789Sahrens pass > zio_sync_pass.zp_rewrite) { 1002789Sahrens ASSERT(csize != 0); 10032885Sahrens BP_SET_LSIZE(bp, lsize); 10042885Sahrens BP_SET_COMPRESS(bp, compress); 1005789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 1006789Sahrens } else { 1007*3882Sahrens if (bp->blk_birth == zio->io_txg) 1008*3882Sahrens BP_ZERO(bp); 1009789Sahrens if (csize == 0) { 1010789Sahrens BP_ZERO(bp); 1011789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1012789Sahrens } else { 10131775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1014789Sahrens BP_SET_LSIZE(bp, lsize); 1015789Sahrens BP_SET_PSIZE(bp, csize); 1016789Sahrens BP_SET_COMPRESS(bp, compress); 1017789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 1018789Sahrens } 1019789Sahrens } 1020789Sahrens 1021789Sahrens zio_next_stage(zio); 1022789Sahrens } 1023789Sahrens 1024789Sahrens static void 1025789Sahrens zio_read_decompress(zio_t *zio) 1026789Sahrens { 1027789Sahrens blkptr_t *bp = zio->io_bp; 1028789Sahrens void *data; 1029789Sahrens uint64_t size; 1030789Sahrens uint64_t bufsize; 1031789Sahrens int compress = BP_GET_COMPRESS(bp); 1032789Sahrens 1033789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 1034789Sahrens 1035789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 1036789Sahrens 1037789Sahrens if (zio_decompress_data(compress, data, size, 1038789Sahrens zio->io_data, zio->io_size)) 1039789Sahrens zio->io_error = EIO; 1040789Sahrens 1041789Sahrens zio_buf_free(data, bufsize); 1042789Sahrens 1043789Sahrens zio_next_stage(zio); 1044789Sahrens } 1045789Sahrens 1046789Sahrens /* 1047789Sahrens * ========================================================================== 1048789Sahrens * Gang block support 1049789Sahrens * ========================================================================== 1050789Sahrens */ 1051789Sahrens static void 1052789Sahrens zio_gang_pipeline(zio_t *zio) 1053789Sahrens { 1054789Sahrens /* 1055789Sahrens * By default, the pipeline assumes that we're dealing with a gang 1056789Sahrens * block. If we're not, strip out any gang-specific stages. 1057789Sahrens */ 10581775Sbillm if (!BP_IS_GANG(zio->io_bp)) 1059789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 1060789Sahrens 1061789Sahrens zio_next_stage(zio); 1062789Sahrens } 1063789Sahrens 1064789Sahrens static void 1065789Sahrens zio_gang_byteswap(zio_t *zio) 1066789Sahrens { 1067789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1068789Sahrens 1069789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1070789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1071789Sahrens } 1072789Sahrens 1073789Sahrens static void 1074789Sahrens zio_get_gang_header(zio_t *zio) 1075789Sahrens { 1076789Sahrens blkptr_t *bp = zio->io_bp; 1077789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1078789Sahrens void *gbuf = zio_buf_alloc(gsize); 1079789Sahrens 10801775Sbillm ASSERT(BP_IS_GANG(bp)); 1081789Sahrens 1082789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1083789Sahrens 1084789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1085789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1086789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1087789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 1088789Sahrens 1089789Sahrens zio_wait_children_done(zio); 1090789Sahrens } 1091789Sahrens 1092789Sahrens static void 1093789Sahrens zio_read_gang_members(zio_t *zio) 1094789Sahrens { 1095789Sahrens zio_gbh_phys_t *gbh; 1096789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1097789Sahrens int i; 1098789Sahrens 10991775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1100789Sahrens 1101789Sahrens zio_gang_byteswap(zio); 1102789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1103789Sahrens 1104789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1105789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1106789Sahrens lsize = BP_GET_PSIZE(gbp); 1107789Sahrens 1108789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1109789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1110789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1111789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1112789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1113789Sahrens 1114789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1115789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 11161544Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 11171544Seschrock &zio->io_bookmark)); 1118789Sahrens } 1119789Sahrens 1120789Sahrens zio_buf_free(gbh, gbufsize); 1121789Sahrens zio_wait_children_done(zio); 1122789Sahrens } 1123789Sahrens 1124789Sahrens static void 1125789Sahrens zio_rewrite_gang_members(zio_t *zio) 1126789Sahrens { 1127789Sahrens zio_gbh_phys_t *gbh; 1128789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1129789Sahrens int i; 1130789Sahrens 11311775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1132789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1133789Sahrens 1134789Sahrens zio_gang_byteswap(zio); 1135789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1136789Sahrens 1137789Sahrens ASSERT(gsize == gbufsize); 1138789Sahrens 1139789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1140789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1141789Sahrens lsize = BP_GET_PSIZE(gbp); 1142789Sahrens 1143789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1144789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1145789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1146789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1147789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1148789Sahrens 1149789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1150789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 11511544Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 11521544Seschrock &zio->io_bookmark)); 1153789Sahrens } 1154789Sahrens 1155789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1156789Sahrens zio_wait_children_ready(zio); 1157789Sahrens } 1158789Sahrens 1159789Sahrens static void 1160789Sahrens zio_free_gang_members(zio_t *zio) 1161789Sahrens { 1162789Sahrens zio_gbh_phys_t *gbh; 1163789Sahrens uint64_t gsize, gbufsize; 1164789Sahrens int i; 1165789Sahrens 11661775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1167789Sahrens 1168789Sahrens zio_gang_byteswap(zio); 1169789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1170789Sahrens 1171789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1172789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1173789Sahrens 1174789Sahrens if (BP_IS_HOLE(gbp)) 1175789Sahrens continue; 1176789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1177789Sahrens gbp, NULL, NULL)); 1178789Sahrens } 1179789Sahrens 1180789Sahrens zio_buf_free(gbh, gbufsize); 1181789Sahrens zio_next_stage(zio); 1182789Sahrens } 1183789Sahrens 1184789Sahrens static void 1185789Sahrens zio_claim_gang_members(zio_t *zio) 1186789Sahrens { 1187789Sahrens zio_gbh_phys_t *gbh; 1188789Sahrens uint64_t gsize, gbufsize; 1189789Sahrens int i; 1190789Sahrens 11911775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1192789Sahrens 1193789Sahrens zio_gang_byteswap(zio); 1194789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1195789Sahrens 1196789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1197789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1198789Sahrens if (BP_IS_HOLE(gbp)) 1199789Sahrens continue; 1200789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1201789Sahrens gbp, NULL, NULL)); 1202789Sahrens } 1203789Sahrens 1204789Sahrens zio_buf_free(gbh, gbufsize); 1205789Sahrens zio_next_stage(zio); 1206789Sahrens } 1207789Sahrens 1208789Sahrens static void 1209789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1210789Sahrens { 1211789Sahrens zio_t *pio = zio->io_parent; 12121775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 12131775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1214789Sahrens uint64_t asize; 12151775Sbillm int d; 1216789Sahrens 12171775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 12181775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 12191775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 12201775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 12211775Sbillm 1222789Sahrens mutex_enter(&pio->io_lock); 12231775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 12241775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 12251775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 12261775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 12271775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 12281775Sbillm } 1229789Sahrens mutex_exit(&pio->io_lock); 1230789Sahrens } 1231789Sahrens 1232789Sahrens static void 1233789Sahrens zio_write_allocate_gang_members(zio_t *zio) 1234789Sahrens { 1235789Sahrens blkptr_t *bp = zio->io_bp; 12361775Sbillm dva_t *dva = bp->blk_dva; 12371775Sbillm spa_t *spa = zio->io_spa; 1238789Sahrens zio_gbh_phys_t *gbh; 12391775Sbillm uint64_t txg = zio->io_txg; 1240789Sahrens uint64_t resid = zio->io_size; 1241789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1242789Sahrens uint64_t gsize, loff, lsize; 1243789Sahrens uint32_t gbps_left; 12441775Sbillm int ndvas = zio->io_ndvas; 12451775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1246789Sahrens int error; 12471775Sbillm int i, d; 1248789Sahrens 1249789Sahrens gsize = SPA_GANGBLOCKSIZE; 1250789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1251789Sahrens 12523063Sperrin error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); 1253789Sahrens if (error == ENOSPC) 1254789Sahrens panic("can't allocate gang block header"); 1255789Sahrens ASSERT(error == 0); 1256789Sahrens 12571775Sbillm for (d = 0; d < gbh_ndvas; d++) 12581775Sbillm DVA_SET_GANG(&dva[d], 1); 1259789Sahrens 12601775Sbillm bp->blk_birth = txg; 1261789Sahrens 1262789Sahrens gbh = zio_buf_alloc(gsize); 1263789Sahrens bzero(gbh, gsize); 1264789Sahrens 12651775Sbillm /* We need to test multi-level gang blocks */ 12661775Sbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 12671775Sbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 12681775Sbillm 1269789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1270789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1271789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 12721775Sbillm dva = gbp->blk_dva; 1273789Sahrens 1274789Sahrens ASSERT(gbps_left != 0); 1275789Sahrens maxalloc = MIN(maxalloc, resid); 1276789Sahrens 1277789Sahrens while (resid <= maxalloc * gbps_left) { 12781775Sbillm error = metaslab_alloc(spa, maxalloc, gbp, ndvas, 12793063Sperrin txg, bp, B_FALSE); 1280789Sahrens if (error == 0) 1281789Sahrens break; 1282789Sahrens ASSERT3U(error, ==, ENOSPC); 1283789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1284789Sahrens panic("really out of space"); 1285789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1286789Sahrens } 1287789Sahrens 1288789Sahrens if (resid <= maxalloc * gbps_left) { 1289789Sahrens lsize = maxalloc; 1290789Sahrens BP_SET_LSIZE(gbp, lsize); 1291789Sahrens BP_SET_PSIZE(gbp, lsize); 1292789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 12931775Sbillm gbp->blk_birth = txg; 12941775Sbillm zio_nowait(zio_rewrite(zio, spa, 12951775Sbillm zio->io_checksum, txg, gbp, 1296789Sahrens (char *)zio->io_data + loff, lsize, 1297789Sahrens zio_write_allocate_gang_member_done, NULL, 12981544Seschrock zio->io_priority, zio->io_flags, 12991544Seschrock &zio->io_bookmark)); 1300789Sahrens } else { 1301789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1302789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 13031775Sbillm zio_nowait(zio_write_allocate(zio, spa, 13041775Sbillm zio->io_checksum, txg, gbp, 1305789Sahrens (char *)zio->io_data + loff, lsize, 1306789Sahrens zio_write_allocate_gang_member_done, NULL, 1307789Sahrens zio->io_priority, zio->io_flags)); 1308789Sahrens } 1309789Sahrens } 1310789Sahrens 1311789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1312789Sahrens 1313789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1314789Sahrens 1315789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 13161775Sbillm /* 13171775Sbillm * As much as we'd like this to be zio_wait_children_ready(), 13181775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 13191775Sbillm * so we have to wait for that to finish in order for our BP 13201775Sbillm * to be stable. 13211775Sbillm */ 1322789Sahrens zio_wait_children_done(zio); 1323789Sahrens } 1324789Sahrens 1325789Sahrens /* 1326789Sahrens * ========================================================================== 1327789Sahrens * Allocate and free blocks 1328789Sahrens * ========================================================================== 1329789Sahrens */ 1330789Sahrens static void 1331789Sahrens zio_dva_allocate(zio_t *zio) 1332789Sahrens { 1333789Sahrens blkptr_t *bp = zio->io_bp; 1334789Sahrens int error; 1335789Sahrens 1336789Sahrens ASSERT(BP_IS_HOLE(bp)); 13371775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 13381775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 13391775Sbillm ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); 1340789Sahrens 1341789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1342789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 1343789Sahrens zio_write_allocate_gang_members(zio); 1344789Sahrens return; 1345789Sahrens } 1346789Sahrens 1347789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1348789Sahrens 13491775Sbillm error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, 13503063Sperrin zio->io_txg, NULL, B_FALSE); 1351789Sahrens 1352789Sahrens if (error == 0) { 1353789Sahrens bp->blk_birth = zio->io_txg; 1354789Sahrens } else if (error == ENOSPC) { 1355789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1356789Sahrens panic("really, truly out of space"); 1357789Sahrens zio_write_allocate_gang_members(zio); 1358789Sahrens return; 1359789Sahrens } else { 1360789Sahrens zio->io_error = error; 1361789Sahrens } 1362789Sahrens zio_next_stage(zio); 1363789Sahrens } 1364789Sahrens 1365789Sahrens static void 1366789Sahrens zio_dva_free(zio_t *zio) 1367789Sahrens { 1368789Sahrens blkptr_t *bp = zio->io_bp; 1369789Sahrens 13701807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1371789Sahrens 1372789Sahrens BP_ZERO(bp); 1373789Sahrens 1374789Sahrens zio_next_stage(zio); 1375789Sahrens } 1376789Sahrens 1377789Sahrens static void 1378789Sahrens zio_dva_claim(zio_t *zio) 1379789Sahrens { 13801807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1381789Sahrens 1382789Sahrens zio_next_stage(zio); 1383789Sahrens } 1384789Sahrens 1385789Sahrens /* 1386789Sahrens * ========================================================================== 1387789Sahrens * Read and write to physical devices 1388789Sahrens * ========================================================================== 1389789Sahrens */ 1390789Sahrens 1391789Sahrens static void 13921775Sbillm zio_vdev_io_start(zio_t *zio) 1393789Sahrens { 1394789Sahrens vdev_t *vd = zio->io_vd; 13951775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 13961775Sbillm blkptr_t *bp = zio->io_bp; 13971775Sbillm uint64_t align; 1398789Sahrens 13991775Sbillm if (vd == NULL) { 14001775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14011775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 14021775Sbillm return; 14031775Sbillm } 14041775Sbillm 14051775Sbillm align = 1ULL << tvd->vdev_ashift; 14061775Sbillm 14071732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1408789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1409789Sahrens 14101775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 14111775Sbillm vd->vdev_children == 0) { 1412789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1413789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1414789Sahrens } 1415789Sahrens 14161732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 14171732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 14181732Sbonwick char *abuf = zio_buf_alloc(asize); 14191732Sbonwick ASSERT(vd == tvd); 14201732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 14211732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 14221732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 14231732Sbonwick } 14241732Sbonwick zio_push_transform(zio, abuf, asize, asize); 14251732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 14261732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 14271732Sbonwick } 14281732Sbonwick 14291732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 14301732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 14311732Sbonwick ASSERT(bp == NULL || 14321732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1433789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1434789Sahrens 1435789Sahrens vdev_io_start(zio); 1436789Sahrens 1437789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1438789Sahrens } 1439789Sahrens 1440789Sahrens static void 1441789Sahrens zio_vdev_io_done(zio_t *zio) 1442789Sahrens { 14431775Sbillm if (zio->io_vd == NULL) 14441775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14451775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 14461775Sbillm else 14471775Sbillm vdev_io_done(zio); 1448789Sahrens } 1449789Sahrens 1450789Sahrens /* XXPOLICY */ 14511544Seschrock boolean_t 1452789Sahrens zio_should_retry(zio_t *zio) 1453789Sahrens { 1454789Sahrens vdev_t *vd = zio->io_vd; 1455789Sahrens 1456789Sahrens if (zio->io_error == 0) 1457789Sahrens return (B_FALSE); 1458789Sahrens if (zio->io_delegate_list != NULL) 1459789Sahrens return (B_FALSE); 14601775Sbillm if (vd && vd != vd->vdev_top) 1461789Sahrens return (B_FALSE); 1462789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1463789Sahrens return (B_FALSE); 14641544Seschrock if (zio->io_retries > 0) 1465789Sahrens return (B_FALSE); 1466789Sahrens 1467789Sahrens return (B_TRUE); 1468789Sahrens } 1469789Sahrens 1470789Sahrens static void 1471789Sahrens zio_vdev_io_assess(zio_t *zio) 1472789Sahrens { 1473789Sahrens vdev_t *vd = zio->io_vd; 14741775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1475789Sahrens 14761544Seschrock ASSERT(zio->io_vsd == NULL); 1477789Sahrens 14781732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 14791732Sbonwick void *abuf; 14801732Sbonwick uint64_t asize; 14811732Sbonwick ASSERT(vd == tvd); 14821732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 14831732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 14841732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 14851732Sbonwick zio_buf_free(abuf, asize); 14861732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 14871732Sbonwick } 14881732Sbonwick 14891544Seschrock if (zio_injection_enabled && !zio->io_error) 14901544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1491789Sahrens 1492789Sahrens /* 1493789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1494789Sahrens */ 1495789Sahrens /* XXPOLICY */ 1496789Sahrens if (zio_should_retry(zio)) { 1497789Sahrens ASSERT(tvd == vd); 1498789Sahrens 1499789Sahrens zio->io_retries++; 1500789Sahrens zio->io_error = 0; 15013463Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | 15023463Sahrens ZIO_FLAG_CONFIG_GRABBED; 1503789Sahrens /* XXPOLICY */ 1504789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1505789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 15061775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1507789Sahrens 1508789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1509789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1510789Sahrens vdev_description(vd), zio->io_offset); 1511789Sahrens 15121544Seschrock zio_next_stage_async(zio); 15131544Seschrock return; 15141544Seschrock } 1515789Sahrens 15161775Sbillm if (zio->io_error != 0 && zio->io_error != ECKSUM && 15171775Sbillm !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { 1518789Sahrens /* 15191544Seschrock * Poor man's hotplug support. Even if we're done retrying this 15201544Seschrock * I/O, try to reopen the vdev to see if it's still attached. 15211544Seschrock * To avoid excessive thrashing, we only try it once a minute. 15221544Seschrock * This also has the effect of detecting when missing devices 15231544Seschrock * have come back, by polling the device once a minute. 15241544Seschrock * 15251544Seschrock * We need to do this asynchronously because we can't grab 15261544Seschrock * all the necessary locks way down here. 1527789Sahrens */ 15281544Seschrock if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { 15291544Seschrock vd->vdev_last_try = gethrtime(); 15301544Seschrock tvd->vdev_reopen_wanted = 1; 15311544Seschrock spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); 15321544Seschrock } 1533789Sahrens } 1534789Sahrens 1535789Sahrens zio_next_stage(zio); 1536789Sahrens } 1537789Sahrens 1538789Sahrens void 1539789Sahrens zio_vdev_io_reissue(zio_t *zio) 1540789Sahrens { 1541789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1542789Sahrens ASSERT(zio->io_error == 0); 1543789Sahrens 1544789Sahrens zio->io_stage--; 1545789Sahrens } 1546789Sahrens 1547789Sahrens void 1548789Sahrens zio_vdev_io_redone(zio_t *zio) 1549789Sahrens { 1550789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1551789Sahrens 1552789Sahrens zio->io_stage--; 1553789Sahrens } 1554789Sahrens 1555789Sahrens void 1556789Sahrens zio_vdev_io_bypass(zio_t *zio) 1557789Sahrens { 1558789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1559789Sahrens ASSERT(zio->io_error == 0); 1560789Sahrens 1561789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1562789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1563789Sahrens } 1564789Sahrens 1565789Sahrens /* 1566789Sahrens * ========================================================================== 1567789Sahrens * Generate and verify checksums 1568789Sahrens * ========================================================================== 1569789Sahrens */ 1570789Sahrens static void 1571789Sahrens zio_checksum_generate(zio_t *zio) 1572789Sahrens { 1573789Sahrens int checksum = zio->io_checksum; 1574789Sahrens blkptr_t *bp = zio->io_bp; 1575789Sahrens 1576789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1577789Sahrens 1578789Sahrens BP_SET_CHECKSUM(bp, checksum); 1579789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1580789Sahrens 1581789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1582789Sahrens 1583789Sahrens zio_next_stage(zio); 1584789Sahrens } 1585789Sahrens 1586789Sahrens static void 1587789Sahrens zio_gang_checksum_generate(zio_t *zio) 1588789Sahrens { 1589789Sahrens zio_cksum_t zc; 1590789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1591789Sahrens 15921775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1593789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1594789Sahrens 1595789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1596789Sahrens 1597789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1598789Sahrens 1599789Sahrens zio_next_stage(zio); 1600789Sahrens } 1601789Sahrens 1602789Sahrens static void 1603789Sahrens zio_checksum_verify(zio_t *zio) 1604789Sahrens { 1605789Sahrens if (zio->io_bp != NULL) { 1606789Sahrens zio->io_error = zio_checksum_error(zio); 16071544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 16081544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 16091544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1610789Sahrens } 1611789Sahrens 1612789Sahrens zio_next_stage(zio); 1613789Sahrens } 1614789Sahrens 1615789Sahrens /* 1616789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1617789Sahrens */ 1618789Sahrens void 1619789Sahrens zio_checksum_verified(zio_t *zio) 1620789Sahrens { 1621789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1622789Sahrens } 1623789Sahrens 1624789Sahrens /* 1625789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1626789Sahrens */ 1627789Sahrens void 1628789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1629789Sahrens { 16301775Sbillm blkptr_t *bp = zio->io_bp; 16311775Sbillm 16321775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 16331775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 16341775Sbillm zcp->zc_word[2] = bp->blk_birth; 1635789Sahrens zcp->zc_word[3] = 0; 1636789Sahrens } 1637789Sahrens 1638789Sahrens /* 1639789Sahrens * ========================================================================== 1640789Sahrens * Define the pipeline 1641789Sahrens * ========================================================================== 1642789Sahrens */ 1643789Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1644789Sahrens 1645789Sahrens static void 1646789Sahrens zio_badop(zio_t *zio) 1647789Sahrens { 1648789Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1649789Sahrens } 1650789Sahrens 1651789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1652789Sahrens zio_badop, 1653789Sahrens zio_wait_children_ready, 1654789Sahrens zio_write_compress, 1655789Sahrens zio_checksum_generate, 1656789Sahrens zio_gang_pipeline, 1657789Sahrens zio_get_gang_header, 1658789Sahrens zio_rewrite_gang_members, 1659789Sahrens zio_free_gang_members, 1660789Sahrens zio_claim_gang_members, 1661789Sahrens zio_dva_allocate, 1662789Sahrens zio_dva_free, 1663789Sahrens zio_dva_claim, 1664789Sahrens zio_gang_checksum_generate, 1665789Sahrens zio_ready, 1666789Sahrens zio_vdev_io_start, 1667789Sahrens zio_vdev_io_done, 1668789Sahrens zio_vdev_io_assess, 1669789Sahrens zio_wait_children_done, 1670789Sahrens zio_checksum_verify, 1671789Sahrens zio_read_gang_members, 1672789Sahrens zio_read_decompress, 1673789Sahrens zio_done, 1674789Sahrens zio_badop 1675789Sahrens }; 1676789Sahrens 1677789Sahrens /* 1678789Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1679789Sahrens * There's no locking on io_stage because there's no legitimate way for 1680789Sahrens * multiple threads to be attempting to process the same I/O. 1681789Sahrens */ 1682789Sahrens void 1683789Sahrens zio_next_stage(zio_t *zio) 1684789Sahrens { 1685789Sahrens uint32_t pipeline = zio->io_pipeline; 1686789Sahrens 1687789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1688789Sahrens 1689789Sahrens if (zio->io_error) { 1690789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1691789Sahrens zio, vdev_description(zio->io_vd), 1692789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1693789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1694789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1695789Sahrens } 1696789Sahrens 1697789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1698789Sahrens continue; 1699789Sahrens 1700789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1701789Sahrens ASSERT(zio->io_stalled == 0); 1702789Sahrens 17033689Sek110237 /* 17043689Sek110237 * See the comment in zio_next_stage_async() about per-CPU taskqs. 17053689Sek110237 */ 17063689Sek110237 if (((1U << zio->io_stage) & zio->io_async_stages) && 17073689Sek110237 (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) && 17083689Sek110237 !(zio->io_flags & ZIO_FLAG_METADATA)) { 17093689Sek110237 taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 17103689Sek110237 (void) taskq_dispatch(tq, 17113689Sek110237 (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 17123689Sek110237 } else { 17133689Sek110237 zio_pipeline[zio->io_stage](zio); 17143689Sek110237 } 1715789Sahrens } 1716789Sahrens 1717789Sahrens void 1718789Sahrens zio_next_stage_async(zio_t *zio) 1719789Sahrens { 1720789Sahrens taskq_t *tq; 1721789Sahrens uint32_t pipeline = zio->io_pipeline; 1722789Sahrens 1723789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1724789Sahrens 1725789Sahrens if (zio->io_error) { 1726789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1727789Sahrens zio, vdev_description(zio->io_vd), 1728789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1729789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1730789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1731789Sahrens } 1732789Sahrens 1733789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1734789Sahrens continue; 1735789Sahrens 1736789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1737789Sahrens ASSERT(zio->io_stalled == 0); 1738789Sahrens 1739789Sahrens /* 1740789Sahrens * For performance, we'll probably want two sets of task queues: 1741789Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1742789Sahrens * part is for read performance: since we have to make a pass over 1743789Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1744789Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1745789Sahrens * that thread is probably still there. Getting this optimization 1746789Sahrens * right avoids performance-hostile cache-to-cache transfers. 1747789Sahrens * 1748789Sahrens * Note that having two sets of task queues is also necessary for 1749789Sahrens * correctness: if all of the issue threads get bogged down waiting 1750789Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1751789Sahrens * there won't be any threads available to service I/O completion 1752789Sahrens * interrupts. 1753789Sahrens */ 1754789Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1755789Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1756789Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1757789Sahrens else 1758789Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1759789Sahrens (void) taskq_dispatch(tq, 1760789Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1761789Sahrens } else { 1762789Sahrens zio_pipeline[zio->io_stage](zio); 1763789Sahrens } 1764789Sahrens } 1765789Sahrens 17663668Sgw25295 static boolean_t 17673668Sgw25295 zio_alloc_should_fail(void) 17683668Sgw25295 { 17693668Sgw25295 static uint16_t allocs = 0; 17703668Sgw25295 17713668Sgw25295 return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0); 17723668Sgw25295 } 17733668Sgw25295 1774789Sahrens /* 1775789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1776789Sahrens */ 1777789Sahrens int 17783063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 17793063Sperrin uint64_t txg) 1780789Sahrens { 1781789Sahrens int error; 1782789Sahrens 17831544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1784789Sahrens 17853668Sgw25295 if (zio_zil_fail_shift && zio_alloc_should_fail()) { 17863668Sgw25295 spa_config_exit(spa, FTAG); 17873668Sgw25295 return (ENOSPC); 17883668Sgw25295 } 17893668Sgw25295 17903063Sperrin /* 17913063Sperrin * We were passed the previous log blocks dva_t in bp->blk_dva[0]. 17923063Sperrin */ 17933063Sperrin error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); 1794789Sahrens 1795789Sahrens if (error == 0) { 17963063Sperrin BP_SET_LSIZE(new_bp, size); 17973063Sperrin BP_SET_PSIZE(new_bp, size); 17983063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 17993063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 18003063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 18013063Sperrin BP_SET_LEVEL(new_bp, 0); 18023063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 18033063Sperrin new_bp->blk_birth = txg; 1804789Sahrens } 1805789Sahrens 18061544Seschrock spa_config_exit(spa, FTAG); 1807789Sahrens 1808789Sahrens return (error); 1809789Sahrens } 1810789Sahrens 1811789Sahrens /* 1812789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 1813789Sahrens * nothing to do except metaslab_free() it. 1814789Sahrens */ 1815789Sahrens void 1816789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1817789Sahrens { 18181775Sbillm ASSERT(!BP_IS_GANG(bp)); 1819789Sahrens 18201544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1821789Sahrens 18221807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 1823789Sahrens 18241544Seschrock spa_config_exit(spa, FTAG); 1825789Sahrens } 1826