1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 223459Sek110237 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 67789Sahrens typedef struct zio_sync_pass { 68789Sahrens int zp_defer_free; /* defer frees after this pass */ 69789Sahrens int zp_dontcompress; /* don't compress after this pass */ 70789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 71789Sahrens } zio_sync_pass_t; 72789Sahrens 73789Sahrens zio_sync_pass_t zio_sync_pass = { 74789Sahrens 1, /* zp_defer_free */ 75789Sahrens 4, /* zp_dontcompress */ 76789Sahrens 1, /* zp_rewrite */ 77789Sahrens }; 78789Sahrens 79789Sahrens /* 80789Sahrens * ========================================================================== 81789Sahrens * I/O kmem caches 82789Sahrens * ========================================================================== 83789Sahrens */ 84789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 853290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 863290Sjohansen 873290Sjohansen #ifdef _KERNEL 883290Sjohansen extern vmem_t *zio_alloc_arena; 893290Sjohansen #endif 90789Sahrens 91789Sahrens void 92789Sahrens zio_init(void) 93789Sahrens { 94789Sahrens size_t c; 953290Sjohansen vmem_t *data_alloc_arena = NULL; 963290Sjohansen 973290Sjohansen #ifdef _KERNEL 983290Sjohansen data_alloc_arena = zio_alloc_arena; 993290Sjohansen #endif 100789Sahrens 101789Sahrens /* 102789Sahrens * For small buffers, we want a cache for each multiple of 103789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 104789Sahrens * for each quarter-power of 2. For large buffers, we want 105789Sahrens * a cache for each multiple of PAGESIZE. 106789Sahrens */ 107789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 108789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 109789Sahrens size_t p2 = size; 110789Sahrens size_t align = 0; 111789Sahrens 112789Sahrens while (p2 & (p2 - 1)) 113789Sahrens p2 &= p2 - 1; 114789Sahrens 115789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 116789Sahrens align = SPA_MINBLOCKSIZE; 117789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 118789Sahrens align = PAGESIZE; 119789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 120789Sahrens align = p2 >> 2; 121789Sahrens } 122789Sahrens 123789Sahrens if (align != 0) { 1243290Sjohansen char name[36]; 1252856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 126789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 127849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1283290Sjohansen 1293290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1303290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1313290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1323290Sjohansen KMC_NODEBUG); 1333290Sjohansen 134789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 135789Sahrens size, align); 136789Sahrens } 137789Sahrens } 138789Sahrens 139789Sahrens while (--c != 0) { 140789Sahrens ASSERT(zio_buf_cache[c] != NULL); 141789Sahrens if (zio_buf_cache[c - 1] == NULL) 142789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1433290Sjohansen 1443290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1453290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1463290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 147789Sahrens } 1481544Seschrock 1491544Seschrock zio_inject_init(); 150789Sahrens } 151789Sahrens 152789Sahrens void 153789Sahrens zio_fini(void) 154789Sahrens { 155789Sahrens size_t c; 156789Sahrens kmem_cache_t *last_cache = NULL; 1573290Sjohansen kmem_cache_t *last_data_cache = NULL; 158789Sahrens 159789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 160789Sahrens if (zio_buf_cache[c] != last_cache) { 161789Sahrens last_cache = zio_buf_cache[c]; 162789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 163789Sahrens } 164789Sahrens zio_buf_cache[c] = NULL; 1653290Sjohansen 1663290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1673290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1683290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1693290Sjohansen } 1703290Sjohansen zio_data_buf_cache[c] = NULL; 171789Sahrens } 1721544Seschrock 1731544Seschrock zio_inject_fini(); 174789Sahrens } 175789Sahrens 176789Sahrens /* 177789Sahrens * ========================================================================== 178789Sahrens * Allocate and free I/O buffers 179789Sahrens * ========================================================================== 180789Sahrens */ 1813290Sjohansen 1823290Sjohansen /* 1833290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1843290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1853290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1863290Sjohansen * excess / transient data in-core during a crashdump. 1873290Sjohansen */ 188789Sahrens void * 189789Sahrens zio_buf_alloc(size_t size) 190789Sahrens { 191789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 192789Sahrens 193789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 194789Sahrens 195789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 196789Sahrens } 197789Sahrens 1983290Sjohansen /* 1993290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2003290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2013290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2023290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2033290Sjohansen */ 2043290Sjohansen void * 2053290Sjohansen zio_data_buf_alloc(size_t size) 2063290Sjohansen { 2073290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2083290Sjohansen 2093290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2103290Sjohansen 2113290Sjohansen return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 2123290Sjohansen } 2133290Sjohansen 214789Sahrens void 215789Sahrens zio_buf_free(void *buf, size_t size) 216789Sahrens { 217789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 218789Sahrens 219789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 220789Sahrens 221789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 222789Sahrens } 223789Sahrens 2243290Sjohansen void 2253290Sjohansen zio_data_buf_free(void *buf, size_t size) 2263290Sjohansen { 2273290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2283290Sjohansen 2293290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2303290Sjohansen 2313290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2323290Sjohansen } 233*3463Sahrens 234789Sahrens /* 235789Sahrens * ========================================================================== 236789Sahrens * Push and pop I/O transform buffers 237789Sahrens * ========================================================================== 238789Sahrens */ 239789Sahrens static void 240789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 241789Sahrens { 242789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 243789Sahrens 244789Sahrens zt->zt_data = data; 245789Sahrens zt->zt_size = size; 246789Sahrens zt->zt_bufsize = bufsize; 247789Sahrens 248789Sahrens zt->zt_next = zio->io_transform_stack; 249789Sahrens zio->io_transform_stack = zt; 250789Sahrens 251789Sahrens zio->io_data = data; 252789Sahrens zio->io_size = size; 253789Sahrens } 254789Sahrens 255789Sahrens static void 256789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 257789Sahrens { 258789Sahrens zio_transform_t *zt = zio->io_transform_stack; 259789Sahrens 260789Sahrens *data = zt->zt_data; 261789Sahrens *size = zt->zt_size; 262789Sahrens *bufsize = zt->zt_bufsize; 263789Sahrens 264789Sahrens zio->io_transform_stack = zt->zt_next; 265789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 266789Sahrens 267789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 268789Sahrens zio->io_data = zt->zt_data; 269789Sahrens zio->io_size = zt->zt_size; 270789Sahrens } 271789Sahrens } 272789Sahrens 273789Sahrens static void 274789Sahrens zio_clear_transform_stack(zio_t *zio) 275789Sahrens { 276789Sahrens void *data; 277789Sahrens uint64_t size, bufsize; 278789Sahrens 279789Sahrens ASSERT(zio->io_transform_stack != NULL); 280789Sahrens 281789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 282789Sahrens while (zio->io_transform_stack != NULL) { 283789Sahrens zio_buf_free(data, bufsize); 284789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 285789Sahrens } 286789Sahrens } 287789Sahrens 288789Sahrens /* 289789Sahrens * ========================================================================== 290789Sahrens * Create the various types of I/O (read, write, free) 291789Sahrens * ========================================================================== 292789Sahrens */ 293789Sahrens static zio_t * 294789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 295789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 296789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 297789Sahrens { 298789Sahrens zio_t *zio; 299789Sahrens 300789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 301789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 302789Sahrens 303789Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 304789Sahrens zio->io_parent = pio; 305789Sahrens zio->io_spa = spa; 306789Sahrens zio->io_txg = txg; 307789Sahrens if (bp != NULL) { 308789Sahrens zio->io_bp = bp; 309789Sahrens zio->io_bp_copy = *bp; 310789Sahrens zio->io_bp_orig = *bp; 311789Sahrens } 312789Sahrens zio->io_done = done; 313789Sahrens zio->io_private = private; 314789Sahrens zio->io_type = type; 315789Sahrens zio->io_priority = priority; 316789Sahrens zio->io_stage = stage; 317789Sahrens zio->io_pipeline = pipeline; 318789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 319789Sahrens zio->io_timestamp = lbolt64; 320789Sahrens zio->io_flags = flags; 3212856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 322789Sahrens zio_push_transform(zio, data, size, size); 323789Sahrens 324*3463Sahrens /* 325*3463Sahrens * Note on config lock: 326*3463Sahrens * 327*3463Sahrens * If CONFIG_HELD is set, then the caller already has the config 328*3463Sahrens * lock, so we don't need it for this io. 329*3463Sahrens * 330*3463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 331*3463Sahrens * config lock on behalf of this io, so it should be released 332*3463Sahrens * in zio_done. 333*3463Sahrens * 334*3463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 335*3463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 336*3463Sahrens * The NULL top-level ios rarely have any children, so we delay 337*3463Sahrens * grabbing the lock until the first child is added (but it is 338*3463Sahrens * still grabbed on behalf of the top-level i/o, so additional 339*3463Sahrens * children don't need to also grab it). This greatly reduces 340*3463Sahrens * contention on the config lock. 341*3463Sahrens */ 342789Sahrens if (pio == NULL) { 343*3463Sahrens if (type != ZIO_TYPE_NULL && 344*3463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3451544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 346*3463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 347*3463Sahrens } 348789Sahrens zio->io_root = zio; 349789Sahrens } else { 350789Sahrens zio->io_root = pio->io_root; 3511544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3521544Seschrock zio->io_logical = pio->io_logical; 353789Sahrens mutex_enter(&pio->io_lock); 354*3463Sahrens if (pio->io_parent == NULL && 355*3463Sahrens pio->io_type == ZIO_TYPE_NULL && 356*3463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 357*3463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 358*3463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 359*3463Sahrens spa_config_enter(zio->io_spa, RW_READER, pio); 360*3463Sahrens } 361789Sahrens if (stage < ZIO_STAGE_READY) 362789Sahrens pio->io_children_notready++; 363789Sahrens pio->io_children_notdone++; 364789Sahrens zio->io_sibling_next = pio->io_child; 365789Sahrens zio->io_sibling_prev = NULL; 366789Sahrens if (pio->io_child != NULL) 367789Sahrens pio->io_child->io_sibling_prev = zio; 368789Sahrens pio->io_child = zio; 3691775Sbillm zio->io_ndvas = pio->io_ndvas; 370789Sahrens mutex_exit(&pio->io_lock); 371789Sahrens } 372789Sahrens 373789Sahrens return (zio); 374789Sahrens } 375789Sahrens 376789Sahrens zio_t * 377789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 378789Sahrens int flags) 379789Sahrens { 380789Sahrens zio_t *zio; 381789Sahrens 382789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 383789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 384789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 385789Sahrens 386789Sahrens return (zio); 387789Sahrens } 388789Sahrens 389789Sahrens zio_t * 390789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 391789Sahrens { 392789Sahrens return (zio_null(NULL, spa, done, private, flags)); 393789Sahrens } 394789Sahrens 395789Sahrens zio_t * 396789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 397789Sahrens uint64_t size, zio_done_func_t *done, void *private, 3981544Seschrock int priority, int flags, zbookmark_t *zb) 399789Sahrens { 400789Sahrens zio_t *zio; 401789Sahrens 402789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 403789Sahrens 404789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 4052981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4062981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4071544Seschrock zio->io_bookmark = *zb; 4081544Seschrock 4091544Seschrock zio->io_logical = zio; 410789Sahrens 411789Sahrens /* 412789Sahrens * Work off our copy of the bp so the caller can free it. 413789Sahrens */ 414789Sahrens zio->io_bp = &zio->io_bp_copy; 415789Sahrens 416789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 417789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 418789Sahrens void *cbuf = zio_buf_alloc(csize); 419789Sahrens 420789Sahrens zio_push_transform(zio, cbuf, csize, csize); 421789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 422789Sahrens } 423789Sahrens 4241775Sbillm if (BP_IS_GANG(bp)) { 425789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 426789Sahrens void *gbuf = zio_buf_alloc(gsize); 427789Sahrens 428789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 429789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 430789Sahrens } 431789Sahrens 432789Sahrens return (zio); 433789Sahrens } 434789Sahrens 435789Sahrens zio_t * 4361775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 437789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4381544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4391544Seschrock zbookmark_t *zb) 440789Sahrens { 441789Sahrens zio_t *zio; 442789Sahrens 443789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 444789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 445789Sahrens 446789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 447789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 448789Sahrens 449789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4502981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 451789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 452789Sahrens 4531544Seschrock zio->io_bookmark = *zb; 4541544Seschrock 4551544Seschrock zio->io_logical = zio; 4561544Seschrock 457789Sahrens zio->io_checksum = checksum; 458789Sahrens zio->io_compress = compress; 4591775Sbillm zio->io_ndvas = ncopies; 460789Sahrens 461789Sahrens if (compress != ZIO_COMPRESS_OFF) 462789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 463789Sahrens 464789Sahrens if (bp->blk_birth != txg) { 465789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 466789Sahrens BP_ZERO(bp); 467789Sahrens BP_SET_LSIZE(bp, size); 468789Sahrens BP_SET_PSIZE(bp, size); 4691775Sbillm } else { 4701775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 4711775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 4721775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 473789Sahrens } 474789Sahrens 475789Sahrens return (zio); 476789Sahrens } 477789Sahrens 478789Sahrens zio_t * 479789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 480789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4811544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4821544Seschrock zbookmark_t *zb) 483789Sahrens { 484789Sahrens zio_t *zio; 485789Sahrens 486789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4872981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 488789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 489789Sahrens 4901544Seschrock zio->io_bookmark = *zb; 491789Sahrens zio->io_checksum = checksum; 492789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 493789Sahrens 4941775Sbillm if (pio != NULL) 4951775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 4961775Sbillm 497789Sahrens return (zio); 498789Sahrens } 499789Sahrens 500789Sahrens static zio_t * 501789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 502789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 503789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 504789Sahrens { 505789Sahrens zio_t *zio; 506789Sahrens 507789Sahrens BP_ZERO(bp); 508789Sahrens BP_SET_LSIZE(bp, size); 509789Sahrens BP_SET_PSIZE(bp, size); 510789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 511789Sahrens 512789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 513789Sahrens ZIO_TYPE_WRITE, priority, flags, 514789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 515789Sahrens 516789Sahrens zio->io_checksum = checksum; 517789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 518789Sahrens 519789Sahrens return (zio); 520789Sahrens } 521789Sahrens 522789Sahrens zio_t * 523789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 524789Sahrens zio_done_func_t *done, void *private) 525789Sahrens { 526789Sahrens zio_t *zio; 527789Sahrens 528789Sahrens ASSERT(!BP_IS_HOLE(bp)); 529789Sahrens 530789Sahrens if (txg == spa->spa_syncing_txg && 531789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 532789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 533789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 534789Sahrens } 535789Sahrens 536789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 5372981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 538789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 539789Sahrens 540789Sahrens zio->io_bp = &zio->io_bp_copy; 541789Sahrens 542789Sahrens return (zio); 543789Sahrens } 544789Sahrens 545789Sahrens zio_t * 546789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 547789Sahrens zio_done_func_t *done, void *private) 548789Sahrens { 549789Sahrens zio_t *zio; 550789Sahrens 551789Sahrens /* 552789Sahrens * A claim is an allocation of a specific block. Claims are needed 553789Sahrens * to support immediate writes in the intent log. The issue is that 554789Sahrens * immediate writes contain committed data, but in a txg that was 555789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 556789Sahrens * the intent log claims all blocks that contain immediate write data 557789Sahrens * so that the SPA knows they're in use. 558789Sahrens * 559789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 560789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 561789Sahrens */ 562789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 563789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 564789Sahrens 565789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 566789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 567789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 568789Sahrens 569789Sahrens zio->io_bp = &zio->io_bp_copy; 570789Sahrens 571789Sahrens return (zio); 572789Sahrens } 573789Sahrens 574789Sahrens zio_t * 575789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 576789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 577789Sahrens { 578789Sahrens zio_t *zio; 579789Sahrens int c; 580789Sahrens 581789Sahrens if (vd->vdev_children == 0) { 582789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 583789Sahrens ZIO_TYPE_IOCTL, priority, flags, 584789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 585789Sahrens 586789Sahrens zio->io_vd = vd; 587789Sahrens zio->io_cmd = cmd; 588789Sahrens } else { 589789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 590789Sahrens 591789Sahrens for (c = 0; c < vd->vdev_children; c++) 592789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 593789Sahrens done, private, priority, flags)); 594789Sahrens } 595789Sahrens 596789Sahrens return (zio); 597789Sahrens } 598789Sahrens 599789Sahrens static void 600789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 601789Sahrens int checksum) 602789Sahrens { 603789Sahrens ASSERT(vd->vdev_children == 0); 604789Sahrens 605789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 606789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 607789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 608789Sahrens 609789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 610789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 611789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 612789Sahrens 613789Sahrens BP_ZERO(bp); 614789Sahrens 615789Sahrens BP_SET_LSIZE(bp, size); 616789Sahrens BP_SET_PSIZE(bp, size); 617789Sahrens 618789Sahrens BP_SET_CHECKSUM(bp, checksum); 619789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 620789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 621789Sahrens 622789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 623789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 624789Sahrens } 625789Sahrens 626789Sahrens zio_t * 627789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 628789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 629789Sahrens int priority, int flags) 630789Sahrens { 631789Sahrens zio_t *zio; 632789Sahrens blkptr_t blk; 633789Sahrens 634789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 635789Sahrens 636789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 637789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 638789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 639789Sahrens 640789Sahrens zio->io_vd = vd; 641789Sahrens zio->io_offset = offset; 642789Sahrens 643789Sahrens /* 644789Sahrens * Work off our copy of the bp so the caller can free it. 645789Sahrens */ 646789Sahrens zio->io_bp = &zio->io_bp_copy; 647789Sahrens 648789Sahrens return (zio); 649789Sahrens } 650789Sahrens 651789Sahrens zio_t * 652789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 653789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 654789Sahrens int priority, int flags) 655789Sahrens { 656789Sahrens zio_block_tail_t *zbt; 657789Sahrens void *wbuf; 658789Sahrens zio_t *zio; 659789Sahrens blkptr_t blk; 660789Sahrens 661789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 662789Sahrens 663789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 664789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 665789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 666789Sahrens 667789Sahrens zio->io_vd = vd; 668789Sahrens zio->io_offset = offset; 669789Sahrens 670789Sahrens zio->io_bp = &zio->io_bp_copy; 671789Sahrens zio->io_checksum = checksum; 672789Sahrens 673789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 674789Sahrens /* 675789Sahrens * zbt checksums are necessarily destructive -- they modify 676789Sahrens * one word of the write buffer to hold the verifier/checksum. 677789Sahrens * Therefore, we must make a local copy in case the data is 678789Sahrens * being written to multiple places. 679789Sahrens */ 680789Sahrens wbuf = zio_buf_alloc(size); 681789Sahrens bcopy(data, wbuf, size); 682789Sahrens zio_push_transform(zio, wbuf, size, size); 683789Sahrens 684789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 685789Sahrens zbt->zbt_cksum = blk.blk_cksum; 686789Sahrens } 687789Sahrens 688789Sahrens return (zio); 689789Sahrens } 690789Sahrens 691789Sahrens /* 692789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 693789Sahrens */ 694789Sahrens zio_t * 695789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 696789Sahrens void *data, uint64_t size, int type, int priority, int flags, 697789Sahrens zio_done_func_t *done, void *private) 698789Sahrens { 699789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 700789Sahrens zio_t *cio; 701789Sahrens 702789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 703789Sahrens /* 704789Sahrens * If we have the bp, then the child should perform the 705789Sahrens * checksum and the parent need not. This pushes error 706789Sahrens * detection as close to the leaves as possible and 707789Sahrens * eliminates redundant checksums in the interior nodes. 708789Sahrens */ 709789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 710789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 711789Sahrens } 712789Sahrens 713789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 714789Sahrens done, private, type, priority, 715789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 7161775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 717789Sahrens 718789Sahrens cio->io_vd = vd; 719789Sahrens cio->io_offset = offset; 720789Sahrens 721789Sahrens return (cio); 722789Sahrens } 723789Sahrens 724789Sahrens /* 725789Sahrens * ========================================================================== 726789Sahrens * Initiate I/O, either sync or async 727789Sahrens * ========================================================================== 728789Sahrens */ 729789Sahrens int 730789Sahrens zio_wait(zio_t *zio) 731789Sahrens { 732789Sahrens int error; 733789Sahrens 734789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 735789Sahrens 736789Sahrens zio->io_waiter = curthread; 737789Sahrens 738789Sahrens zio_next_stage_async(zio); 739789Sahrens 740789Sahrens mutex_enter(&zio->io_lock); 741789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 742789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 743789Sahrens mutex_exit(&zio->io_lock); 744789Sahrens 745789Sahrens error = zio->io_error; 7462856Snd150628 mutex_destroy(&zio->io_lock); 747789Sahrens kmem_free(zio, sizeof (zio_t)); 748789Sahrens 749789Sahrens return (error); 750789Sahrens } 751789Sahrens 752789Sahrens void 753789Sahrens zio_nowait(zio_t *zio) 754789Sahrens { 755789Sahrens zio_next_stage_async(zio); 756789Sahrens } 757789Sahrens 758789Sahrens /* 759789Sahrens * ========================================================================== 760789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 761789Sahrens * ========================================================================== 762789Sahrens */ 763789Sahrens static void 764789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 765789Sahrens { 766789Sahrens mutex_enter(&zio->io_lock); 767789Sahrens if (*countp == 0) { 768789Sahrens ASSERT(zio->io_stalled == 0); 769789Sahrens mutex_exit(&zio->io_lock); 770789Sahrens zio_next_stage(zio); 771789Sahrens } else { 772789Sahrens zio->io_stalled = stage; 773789Sahrens mutex_exit(&zio->io_lock); 774789Sahrens } 775789Sahrens } 776789Sahrens 777789Sahrens static void 778789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 779789Sahrens { 780789Sahrens zio_t *pio = zio->io_parent; 781789Sahrens 782789Sahrens mutex_enter(&pio->io_lock); 783789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 784789Sahrens pio->io_error = zio->io_error; 785789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 786789Sahrens pio->io_stalled = 0; 787789Sahrens mutex_exit(&pio->io_lock); 788789Sahrens zio_next_stage_async(pio); 789789Sahrens } else { 790789Sahrens mutex_exit(&pio->io_lock); 791789Sahrens } 792789Sahrens } 793789Sahrens 794789Sahrens static void 795789Sahrens zio_wait_children_ready(zio_t *zio) 796789Sahrens { 797789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 798789Sahrens &zio->io_children_notready); 799789Sahrens } 800789Sahrens 801789Sahrens void 802789Sahrens zio_wait_children_done(zio_t *zio) 803789Sahrens { 804789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 805789Sahrens &zio->io_children_notdone); 806789Sahrens } 807789Sahrens 808789Sahrens static void 809789Sahrens zio_ready(zio_t *zio) 810789Sahrens { 811789Sahrens zio_t *pio = zio->io_parent; 812789Sahrens 813789Sahrens if (pio != NULL) 814789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 815789Sahrens &pio->io_children_notready); 816789Sahrens 817789Sahrens if (zio->io_bp) 818789Sahrens zio->io_bp_copy = *zio->io_bp; 819789Sahrens 820789Sahrens zio_next_stage(zio); 821789Sahrens } 822789Sahrens 823789Sahrens static void 824789Sahrens zio_done(zio_t *zio) 825789Sahrens { 826789Sahrens zio_t *pio = zio->io_parent; 827789Sahrens spa_t *spa = zio->io_spa; 828789Sahrens blkptr_t *bp = zio->io_bp; 829789Sahrens vdev_t *vd = zio->io_vd; 830789Sahrens 831789Sahrens ASSERT(zio->io_children_notready == 0); 832789Sahrens ASSERT(zio->io_children_notdone == 0); 833789Sahrens 834789Sahrens if (bp != NULL) { 835789Sahrens ASSERT(bp->blk_pad[0] == 0); 836789Sahrens ASSERT(bp->blk_pad[1] == 0); 837789Sahrens ASSERT(bp->blk_pad[2] == 0); 838789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 839789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 8401775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 841789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 8421775Sbillm if (zio->io_ndvas != 0) 8431775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 8441775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 8451775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 8461775Sbillm } 847789Sahrens } 848789Sahrens 849789Sahrens if (vd != NULL) 850789Sahrens vdev_stat_update(zio); 851789Sahrens 852789Sahrens if (zio->io_error) { 8531544Seschrock /* 8541544Seschrock * If this I/O is attached to a particular vdev, 8551544Seschrock * generate an error message describing the I/O failure 8561544Seschrock * at the block level. We ignore these errors if the 8571544Seschrock * device is currently unavailable. 8581544Seschrock */ 8591732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 8601544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 8611732Sbonwick zio->io_spa, vd, zio, 0, 0); 862789Sahrens 8631544Seschrock if ((zio->io_error == EIO || 8641544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 8651544Seschrock zio->io_logical == zio) { 8661544Seschrock /* 8671544Seschrock * For root I/O requests, tell the SPA to log the error 8681544Seschrock * appropriately. Also, generate a logical data 8691544Seschrock * ereport. 8701544Seschrock */ 8711544Seschrock spa_log_error(zio->io_spa, zio); 8721544Seschrock 8731544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 8741544Seschrock zio->io_spa, NULL, zio, 0, 0); 8751544Seschrock } 876789Sahrens 8771544Seschrock /* 8781544Seschrock * For I/O requests that cannot fail, panic appropriately. 8791544Seschrock */ 8801544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 8813459Sek110237 char *blkbuf; 8823459Sek110237 8833459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 8843459Sek110237 if (blkbuf) { 8853459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 8863459Sek110237 bp ? bp : &zio->io_bp_copy); 8873459Sek110237 } 8881544Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 8891544Seschrock "%d", zio->io_error == ECKSUM ? 8901544Seschrock "bad checksum" : "I/O failure", 8911544Seschrock zio_type_name[zio->io_type], 8921544Seschrock vdev_description(vd), 8931544Seschrock (u_longlong_t)zio->io_offset, 8943459Sek110237 zio, blkbuf ? blkbuf : "", zio->io_error); 8951544Seschrock } 896789Sahrens } 897789Sahrens zio_clear_transform_stack(zio); 898789Sahrens 899789Sahrens if (zio->io_done) 900789Sahrens zio->io_done(zio); 901789Sahrens 902789Sahrens ASSERT(zio->io_delegate_list == NULL); 903789Sahrens ASSERT(zio->io_delegate_next == NULL); 904789Sahrens 905789Sahrens if (pio != NULL) { 906789Sahrens zio_t *next, *prev; 907789Sahrens 908789Sahrens mutex_enter(&pio->io_lock); 909789Sahrens next = zio->io_sibling_next; 910789Sahrens prev = zio->io_sibling_prev; 911789Sahrens if (next != NULL) 912789Sahrens next->io_sibling_prev = prev; 913789Sahrens if (prev != NULL) 914789Sahrens prev->io_sibling_next = next; 915789Sahrens if (pio->io_child == zio) 916789Sahrens pio->io_child = next; 917789Sahrens mutex_exit(&pio->io_lock); 918789Sahrens 919789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 920789Sahrens &pio->io_children_notdone); 921789Sahrens } 922789Sahrens 923*3463Sahrens /* 924*3463Sahrens * Note: this I/O is now done, and will shortly be 925*3463Sahrens * kmem_free()'d, so there is no need to clear this (or any 926*3463Sahrens * other) flag. 927*3463Sahrens */ 928*3463Sahrens if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 9291544Seschrock spa_config_exit(spa, zio); 930789Sahrens 931789Sahrens if (zio->io_waiter != NULL) { 932789Sahrens mutex_enter(&zio->io_lock); 933789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 934789Sahrens zio->io_stalled = zio->io_stage; 935789Sahrens cv_broadcast(&zio->io_cv); 936789Sahrens mutex_exit(&zio->io_lock); 937789Sahrens } else { 938789Sahrens kmem_free(zio, sizeof (zio_t)); 939789Sahrens } 940789Sahrens } 941789Sahrens 942789Sahrens /* 943789Sahrens * ========================================================================== 944789Sahrens * Compression support 945789Sahrens * ========================================================================== 946789Sahrens */ 947789Sahrens static void 948789Sahrens zio_write_compress(zio_t *zio) 949789Sahrens { 950789Sahrens int compress = zio->io_compress; 951789Sahrens blkptr_t *bp = zio->io_bp; 952789Sahrens void *cbuf; 953789Sahrens uint64_t lsize = zio->io_size; 954789Sahrens uint64_t csize = lsize; 955789Sahrens uint64_t cbufsize = 0; 956789Sahrens int pass; 957789Sahrens 958789Sahrens if (bp->blk_birth == zio->io_txg) { 959789Sahrens /* 960789Sahrens * We're rewriting an existing block, which means we're 961789Sahrens * working on behalf of spa_sync(). For spa_sync() to 962789Sahrens * converge, it must eventually be the case that we don't 963789Sahrens * have to allocate new blocks. But compression changes 964789Sahrens * the blocksize, which forces a reallocate, and makes 965789Sahrens * convergence take longer. Therefore, after the first 966789Sahrens * few passes, stop compressing to ensure convergence. 967789Sahrens */ 968789Sahrens pass = spa_sync_pass(zio->io_spa); 969789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 970789Sahrens compress = ZIO_COMPRESS_OFF; 971789Sahrens } else { 972789Sahrens ASSERT(BP_IS_HOLE(bp)); 973789Sahrens pass = 1; 974789Sahrens } 975789Sahrens 976789Sahrens if (compress != ZIO_COMPRESS_OFF) 977789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 978789Sahrens &cbuf, &csize, &cbufsize)) 979789Sahrens compress = ZIO_COMPRESS_OFF; 980789Sahrens 981789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 982789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 983789Sahrens 984789Sahrens /* 985789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 986789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 987789Sahrens * but newly allocated blocks are sequential, so they can be written 988789Sahrens * to disk faster. Therefore, we allow the first few passes of 989789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 990789Sahrens * There should only be a handful of blocks after pass 1 in any case. 991789Sahrens */ 992789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 993789Sahrens pass > zio_sync_pass.zp_rewrite) { 994789Sahrens ASSERT(csize != 0); 9952885Sahrens BP_SET_LSIZE(bp, lsize); 9962885Sahrens BP_SET_COMPRESS(bp, compress); 997789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 998789Sahrens } else { 999789Sahrens if (bp->blk_birth == zio->io_txg) { 1000789Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 1001789Sahrens bzero(bp, sizeof (blkptr_t)); 1002789Sahrens } 1003789Sahrens if (csize == 0) { 1004789Sahrens BP_ZERO(bp); 1005789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1006789Sahrens } else { 10071775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1008789Sahrens BP_SET_LSIZE(bp, lsize); 1009789Sahrens BP_SET_PSIZE(bp, csize); 1010789Sahrens BP_SET_COMPRESS(bp, compress); 1011789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 1012789Sahrens } 1013789Sahrens } 1014789Sahrens 1015789Sahrens zio_next_stage(zio); 1016789Sahrens } 1017789Sahrens 1018789Sahrens static void 1019789Sahrens zio_read_decompress(zio_t *zio) 1020789Sahrens { 1021789Sahrens blkptr_t *bp = zio->io_bp; 1022789Sahrens void *data; 1023789Sahrens uint64_t size; 1024789Sahrens uint64_t bufsize; 1025789Sahrens int compress = BP_GET_COMPRESS(bp); 1026789Sahrens 1027789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 1028789Sahrens 1029789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 1030789Sahrens 1031789Sahrens if (zio_decompress_data(compress, data, size, 1032789Sahrens zio->io_data, zio->io_size)) 1033789Sahrens zio->io_error = EIO; 1034789Sahrens 1035789Sahrens zio_buf_free(data, bufsize); 1036789Sahrens 1037789Sahrens zio_next_stage(zio); 1038789Sahrens } 1039789Sahrens 1040789Sahrens /* 1041789Sahrens * ========================================================================== 1042789Sahrens * Gang block support 1043789Sahrens * ========================================================================== 1044789Sahrens */ 1045789Sahrens static void 1046789Sahrens zio_gang_pipeline(zio_t *zio) 1047789Sahrens { 1048789Sahrens /* 1049789Sahrens * By default, the pipeline assumes that we're dealing with a gang 1050789Sahrens * block. If we're not, strip out any gang-specific stages. 1051789Sahrens */ 10521775Sbillm if (!BP_IS_GANG(zio->io_bp)) 1053789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 1054789Sahrens 1055789Sahrens zio_next_stage(zio); 1056789Sahrens } 1057789Sahrens 1058789Sahrens static void 1059789Sahrens zio_gang_byteswap(zio_t *zio) 1060789Sahrens { 1061789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1062789Sahrens 1063789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1064789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1065789Sahrens } 1066789Sahrens 1067789Sahrens static void 1068789Sahrens zio_get_gang_header(zio_t *zio) 1069789Sahrens { 1070789Sahrens blkptr_t *bp = zio->io_bp; 1071789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1072789Sahrens void *gbuf = zio_buf_alloc(gsize); 1073789Sahrens 10741775Sbillm ASSERT(BP_IS_GANG(bp)); 1075789Sahrens 1076789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1077789Sahrens 1078789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1079789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1080789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1081789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 1082789Sahrens 1083789Sahrens zio_wait_children_done(zio); 1084789Sahrens } 1085789Sahrens 1086789Sahrens static void 1087789Sahrens zio_read_gang_members(zio_t *zio) 1088789Sahrens { 1089789Sahrens zio_gbh_phys_t *gbh; 1090789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1091789Sahrens int i; 1092789Sahrens 10931775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1094789Sahrens 1095789Sahrens zio_gang_byteswap(zio); 1096789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1097789Sahrens 1098789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1099789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1100789Sahrens lsize = BP_GET_PSIZE(gbp); 1101789Sahrens 1102789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1103789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1104789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1105789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1106789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1107789Sahrens 1108789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1109789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 11101544Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 11111544Seschrock &zio->io_bookmark)); 1112789Sahrens } 1113789Sahrens 1114789Sahrens zio_buf_free(gbh, gbufsize); 1115789Sahrens zio_wait_children_done(zio); 1116789Sahrens } 1117789Sahrens 1118789Sahrens static void 1119789Sahrens zio_rewrite_gang_members(zio_t *zio) 1120789Sahrens { 1121789Sahrens zio_gbh_phys_t *gbh; 1122789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1123789Sahrens int i; 1124789Sahrens 11251775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1126789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1127789Sahrens 1128789Sahrens zio_gang_byteswap(zio); 1129789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1130789Sahrens 1131789Sahrens ASSERT(gsize == gbufsize); 1132789Sahrens 1133789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1134789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1135789Sahrens lsize = BP_GET_PSIZE(gbp); 1136789Sahrens 1137789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1138789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1139789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1140789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1141789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1142789Sahrens 1143789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1144789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 11451544Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 11461544Seschrock &zio->io_bookmark)); 1147789Sahrens } 1148789Sahrens 1149789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1150789Sahrens zio_wait_children_ready(zio); 1151789Sahrens } 1152789Sahrens 1153789Sahrens static void 1154789Sahrens zio_free_gang_members(zio_t *zio) 1155789Sahrens { 1156789Sahrens zio_gbh_phys_t *gbh; 1157789Sahrens uint64_t gsize, gbufsize; 1158789Sahrens int i; 1159789Sahrens 11601775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1161789Sahrens 1162789Sahrens zio_gang_byteswap(zio); 1163789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1164789Sahrens 1165789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1166789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1167789Sahrens 1168789Sahrens if (BP_IS_HOLE(gbp)) 1169789Sahrens continue; 1170789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1171789Sahrens gbp, NULL, NULL)); 1172789Sahrens } 1173789Sahrens 1174789Sahrens zio_buf_free(gbh, gbufsize); 1175789Sahrens zio_next_stage(zio); 1176789Sahrens } 1177789Sahrens 1178789Sahrens static void 1179789Sahrens zio_claim_gang_members(zio_t *zio) 1180789Sahrens { 1181789Sahrens zio_gbh_phys_t *gbh; 1182789Sahrens uint64_t gsize, gbufsize; 1183789Sahrens int i; 1184789Sahrens 11851775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1186789Sahrens 1187789Sahrens zio_gang_byteswap(zio); 1188789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1189789Sahrens 1190789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1191789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1192789Sahrens if (BP_IS_HOLE(gbp)) 1193789Sahrens continue; 1194789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1195789Sahrens gbp, NULL, NULL)); 1196789Sahrens } 1197789Sahrens 1198789Sahrens zio_buf_free(gbh, gbufsize); 1199789Sahrens zio_next_stage(zio); 1200789Sahrens } 1201789Sahrens 1202789Sahrens static void 1203789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1204789Sahrens { 1205789Sahrens zio_t *pio = zio->io_parent; 12061775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 12071775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1208789Sahrens uint64_t asize; 12091775Sbillm int d; 1210789Sahrens 12111775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 12121775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 12131775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 12141775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 12151775Sbillm 1216789Sahrens mutex_enter(&pio->io_lock); 12171775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 12181775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 12191775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 12201775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 12211775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 12221775Sbillm } 1223789Sahrens mutex_exit(&pio->io_lock); 1224789Sahrens } 1225789Sahrens 1226789Sahrens static void 1227789Sahrens zio_write_allocate_gang_members(zio_t *zio) 1228789Sahrens { 1229789Sahrens blkptr_t *bp = zio->io_bp; 12301775Sbillm dva_t *dva = bp->blk_dva; 12311775Sbillm spa_t *spa = zio->io_spa; 1232789Sahrens zio_gbh_phys_t *gbh; 12331775Sbillm uint64_t txg = zio->io_txg; 1234789Sahrens uint64_t resid = zio->io_size; 1235789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1236789Sahrens uint64_t gsize, loff, lsize; 1237789Sahrens uint32_t gbps_left; 12381775Sbillm int ndvas = zio->io_ndvas; 12391775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1240789Sahrens int error; 12411775Sbillm int i, d; 1242789Sahrens 1243789Sahrens gsize = SPA_GANGBLOCKSIZE; 1244789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1245789Sahrens 12463063Sperrin error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); 1247789Sahrens if (error == ENOSPC) 1248789Sahrens panic("can't allocate gang block header"); 1249789Sahrens ASSERT(error == 0); 1250789Sahrens 12511775Sbillm for (d = 0; d < gbh_ndvas; d++) 12521775Sbillm DVA_SET_GANG(&dva[d], 1); 1253789Sahrens 12541775Sbillm bp->blk_birth = txg; 1255789Sahrens 1256789Sahrens gbh = zio_buf_alloc(gsize); 1257789Sahrens bzero(gbh, gsize); 1258789Sahrens 12591775Sbillm /* We need to test multi-level gang blocks */ 12601775Sbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 12611775Sbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 12621775Sbillm 1263789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1264789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1265789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 12661775Sbillm dva = gbp->blk_dva; 1267789Sahrens 1268789Sahrens ASSERT(gbps_left != 0); 1269789Sahrens maxalloc = MIN(maxalloc, resid); 1270789Sahrens 1271789Sahrens while (resid <= maxalloc * gbps_left) { 12721775Sbillm error = metaslab_alloc(spa, maxalloc, gbp, ndvas, 12733063Sperrin txg, bp, B_FALSE); 1274789Sahrens if (error == 0) 1275789Sahrens break; 1276789Sahrens ASSERT3U(error, ==, ENOSPC); 1277789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1278789Sahrens panic("really out of space"); 1279789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1280789Sahrens } 1281789Sahrens 1282789Sahrens if (resid <= maxalloc * gbps_left) { 1283789Sahrens lsize = maxalloc; 1284789Sahrens BP_SET_LSIZE(gbp, lsize); 1285789Sahrens BP_SET_PSIZE(gbp, lsize); 1286789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 12871775Sbillm gbp->blk_birth = txg; 12881775Sbillm zio_nowait(zio_rewrite(zio, spa, 12891775Sbillm zio->io_checksum, txg, gbp, 1290789Sahrens (char *)zio->io_data + loff, lsize, 1291789Sahrens zio_write_allocate_gang_member_done, NULL, 12921544Seschrock zio->io_priority, zio->io_flags, 12931544Seschrock &zio->io_bookmark)); 1294789Sahrens } else { 1295789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1296789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 12971775Sbillm zio_nowait(zio_write_allocate(zio, spa, 12981775Sbillm zio->io_checksum, txg, gbp, 1299789Sahrens (char *)zio->io_data + loff, lsize, 1300789Sahrens zio_write_allocate_gang_member_done, NULL, 1301789Sahrens zio->io_priority, zio->io_flags)); 1302789Sahrens } 1303789Sahrens } 1304789Sahrens 1305789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1306789Sahrens 1307789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1308789Sahrens 1309789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 13101775Sbillm /* 13111775Sbillm * As much as we'd like this to be zio_wait_children_ready(), 13121775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 13131775Sbillm * so we have to wait for that to finish in order for our BP 13141775Sbillm * to be stable. 13151775Sbillm */ 1316789Sahrens zio_wait_children_done(zio); 1317789Sahrens } 1318789Sahrens 1319789Sahrens /* 1320789Sahrens * ========================================================================== 1321789Sahrens * Allocate and free blocks 1322789Sahrens * ========================================================================== 1323789Sahrens */ 1324789Sahrens static void 1325789Sahrens zio_dva_allocate(zio_t *zio) 1326789Sahrens { 1327789Sahrens blkptr_t *bp = zio->io_bp; 1328789Sahrens int error; 1329789Sahrens 1330789Sahrens ASSERT(BP_IS_HOLE(bp)); 13311775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 13321775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 13331775Sbillm ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); 1334789Sahrens 1335789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1336789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 1337789Sahrens zio_write_allocate_gang_members(zio); 1338789Sahrens return; 1339789Sahrens } 1340789Sahrens 1341789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1342789Sahrens 13431775Sbillm error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, 13443063Sperrin zio->io_txg, NULL, B_FALSE); 1345789Sahrens 1346789Sahrens if (error == 0) { 1347789Sahrens bp->blk_birth = zio->io_txg; 1348789Sahrens } else if (error == ENOSPC) { 1349789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1350789Sahrens panic("really, truly out of space"); 1351789Sahrens zio_write_allocate_gang_members(zio); 1352789Sahrens return; 1353789Sahrens } else { 1354789Sahrens zio->io_error = error; 1355789Sahrens } 1356789Sahrens zio_next_stage(zio); 1357789Sahrens } 1358789Sahrens 1359789Sahrens static void 1360789Sahrens zio_dva_free(zio_t *zio) 1361789Sahrens { 1362789Sahrens blkptr_t *bp = zio->io_bp; 1363789Sahrens 13641807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1365789Sahrens 1366789Sahrens BP_ZERO(bp); 1367789Sahrens 1368789Sahrens zio_next_stage(zio); 1369789Sahrens } 1370789Sahrens 1371789Sahrens static void 1372789Sahrens zio_dva_claim(zio_t *zio) 1373789Sahrens { 13741807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1375789Sahrens 1376789Sahrens zio_next_stage(zio); 1377789Sahrens } 1378789Sahrens 1379789Sahrens /* 1380789Sahrens * ========================================================================== 1381789Sahrens * Read and write to physical devices 1382789Sahrens * ========================================================================== 1383789Sahrens */ 1384789Sahrens 1385789Sahrens static void 13861775Sbillm zio_vdev_io_start(zio_t *zio) 1387789Sahrens { 1388789Sahrens vdev_t *vd = zio->io_vd; 13891775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 13901775Sbillm blkptr_t *bp = zio->io_bp; 13911775Sbillm uint64_t align; 1392789Sahrens 13931775Sbillm if (vd == NULL) { 13941775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 13951775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 13961775Sbillm return; 13971775Sbillm } 13981775Sbillm 13991775Sbillm align = 1ULL << tvd->vdev_ashift; 14001775Sbillm 14011732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1402789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1403789Sahrens 14041775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 14051775Sbillm vd->vdev_children == 0) { 1406789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1407789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1408789Sahrens } 1409789Sahrens 14101732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 14111732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 14121732Sbonwick char *abuf = zio_buf_alloc(asize); 14131732Sbonwick ASSERT(vd == tvd); 14141732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 14151732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 14161732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 14171732Sbonwick } 14181732Sbonwick zio_push_transform(zio, abuf, asize, asize); 14191732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 14201732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 14211732Sbonwick } 14221732Sbonwick 14231732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 14241732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 14251732Sbonwick ASSERT(bp == NULL || 14261732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1427789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1428789Sahrens 1429789Sahrens vdev_io_start(zio); 1430789Sahrens 1431789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1432789Sahrens } 1433789Sahrens 1434789Sahrens static void 1435789Sahrens zio_vdev_io_done(zio_t *zio) 1436789Sahrens { 14371775Sbillm if (zio->io_vd == NULL) 14381775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14391775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 14401775Sbillm else 14411775Sbillm vdev_io_done(zio); 1442789Sahrens } 1443789Sahrens 1444789Sahrens /* XXPOLICY */ 14451544Seschrock boolean_t 1446789Sahrens zio_should_retry(zio_t *zio) 1447789Sahrens { 1448789Sahrens vdev_t *vd = zio->io_vd; 1449789Sahrens 1450789Sahrens if (zio->io_error == 0) 1451789Sahrens return (B_FALSE); 1452789Sahrens if (zio->io_delegate_list != NULL) 1453789Sahrens return (B_FALSE); 14541775Sbillm if (vd && vd != vd->vdev_top) 1455789Sahrens return (B_FALSE); 1456789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1457789Sahrens return (B_FALSE); 14581544Seschrock if (zio->io_retries > 0) 1459789Sahrens return (B_FALSE); 1460789Sahrens 1461789Sahrens return (B_TRUE); 1462789Sahrens } 1463789Sahrens 1464789Sahrens static void 1465789Sahrens zio_vdev_io_assess(zio_t *zio) 1466789Sahrens { 1467789Sahrens vdev_t *vd = zio->io_vd; 14681775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1469789Sahrens 14701544Seschrock ASSERT(zio->io_vsd == NULL); 1471789Sahrens 14721732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 14731732Sbonwick void *abuf; 14741732Sbonwick uint64_t asize; 14751732Sbonwick ASSERT(vd == tvd); 14761732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 14771732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 14781732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 14791732Sbonwick zio_buf_free(abuf, asize); 14801732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 14811732Sbonwick } 14821732Sbonwick 14831544Seschrock if (zio_injection_enabled && !zio->io_error) 14841544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1485789Sahrens 1486789Sahrens /* 1487789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1488789Sahrens */ 1489789Sahrens /* XXPOLICY */ 1490789Sahrens if (zio_should_retry(zio)) { 1491789Sahrens ASSERT(tvd == vd); 1492789Sahrens 1493789Sahrens zio->io_retries++; 1494789Sahrens zio->io_error = 0; 1495*3463Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | 1496*3463Sahrens ZIO_FLAG_CONFIG_GRABBED; 1497789Sahrens /* XXPOLICY */ 1498789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1499789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 15001775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1501789Sahrens 1502789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1503789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1504789Sahrens vdev_description(vd), zio->io_offset); 1505789Sahrens 15061544Seschrock zio_next_stage_async(zio); 15071544Seschrock return; 15081544Seschrock } 1509789Sahrens 15101775Sbillm if (zio->io_error != 0 && zio->io_error != ECKSUM && 15111775Sbillm !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { 1512789Sahrens /* 15131544Seschrock * Poor man's hotplug support. Even if we're done retrying this 15141544Seschrock * I/O, try to reopen the vdev to see if it's still attached. 15151544Seschrock * To avoid excessive thrashing, we only try it once a minute. 15161544Seschrock * This also has the effect of detecting when missing devices 15171544Seschrock * have come back, by polling the device once a minute. 15181544Seschrock * 15191544Seschrock * We need to do this asynchronously because we can't grab 15201544Seschrock * all the necessary locks way down here. 1521789Sahrens */ 15221544Seschrock if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { 15231544Seschrock vd->vdev_last_try = gethrtime(); 15241544Seschrock tvd->vdev_reopen_wanted = 1; 15251544Seschrock spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); 15261544Seschrock } 1527789Sahrens } 1528789Sahrens 1529789Sahrens zio_next_stage(zio); 1530789Sahrens } 1531789Sahrens 1532789Sahrens void 1533789Sahrens zio_vdev_io_reissue(zio_t *zio) 1534789Sahrens { 1535789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1536789Sahrens ASSERT(zio->io_error == 0); 1537789Sahrens 1538789Sahrens zio->io_stage--; 1539789Sahrens } 1540789Sahrens 1541789Sahrens void 1542789Sahrens zio_vdev_io_redone(zio_t *zio) 1543789Sahrens { 1544789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1545789Sahrens 1546789Sahrens zio->io_stage--; 1547789Sahrens } 1548789Sahrens 1549789Sahrens void 1550789Sahrens zio_vdev_io_bypass(zio_t *zio) 1551789Sahrens { 1552789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1553789Sahrens ASSERT(zio->io_error == 0); 1554789Sahrens 1555789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1556789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1557789Sahrens } 1558789Sahrens 1559789Sahrens /* 1560789Sahrens * ========================================================================== 1561789Sahrens * Generate and verify checksums 1562789Sahrens * ========================================================================== 1563789Sahrens */ 1564789Sahrens static void 1565789Sahrens zio_checksum_generate(zio_t *zio) 1566789Sahrens { 1567789Sahrens int checksum = zio->io_checksum; 1568789Sahrens blkptr_t *bp = zio->io_bp; 1569789Sahrens 1570789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1571789Sahrens 1572789Sahrens BP_SET_CHECKSUM(bp, checksum); 1573789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1574789Sahrens 1575789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1576789Sahrens 1577789Sahrens zio_next_stage(zio); 1578789Sahrens } 1579789Sahrens 1580789Sahrens static void 1581789Sahrens zio_gang_checksum_generate(zio_t *zio) 1582789Sahrens { 1583789Sahrens zio_cksum_t zc; 1584789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1585789Sahrens 15861775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1587789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1588789Sahrens 1589789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1590789Sahrens 1591789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1592789Sahrens 1593789Sahrens zio_next_stage(zio); 1594789Sahrens } 1595789Sahrens 1596789Sahrens static void 1597789Sahrens zio_checksum_verify(zio_t *zio) 1598789Sahrens { 1599789Sahrens if (zio->io_bp != NULL) { 1600789Sahrens zio->io_error = zio_checksum_error(zio); 16011544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 16021544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 16031544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1604789Sahrens } 1605789Sahrens 1606789Sahrens zio_next_stage(zio); 1607789Sahrens } 1608789Sahrens 1609789Sahrens /* 1610789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1611789Sahrens */ 1612789Sahrens void 1613789Sahrens zio_checksum_verified(zio_t *zio) 1614789Sahrens { 1615789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1616789Sahrens } 1617789Sahrens 1618789Sahrens /* 1619789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1620789Sahrens */ 1621789Sahrens void 1622789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1623789Sahrens { 16241775Sbillm blkptr_t *bp = zio->io_bp; 16251775Sbillm 16261775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 16271775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 16281775Sbillm zcp->zc_word[2] = bp->blk_birth; 1629789Sahrens zcp->zc_word[3] = 0; 1630789Sahrens } 1631789Sahrens 1632789Sahrens /* 1633789Sahrens * ========================================================================== 1634789Sahrens * Define the pipeline 1635789Sahrens * ========================================================================== 1636789Sahrens */ 1637789Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1638789Sahrens 1639789Sahrens static void 1640789Sahrens zio_badop(zio_t *zio) 1641789Sahrens { 1642789Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1643789Sahrens } 1644789Sahrens 1645789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1646789Sahrens zio_badop, 1647789Sahrens zio_wait_children_ready, 1648789Sahrens zio_write_compress, 1649789Sahrens zio_checksum_generate, 1650789Sahrens zio_gang_pipeline, 1651789Sahrens zio_get_gang_header, 1652789Sahrens zio_rewrite_gang_members, 1653789Sahrens zio_free_gang_members, 1654789Sahrens zio_claim_gang_members, 1655789Sahrens zio_dva_allocate, 1656789Sahrens zio_dva_free, 1657789Sahrens zio_dva_claim, 1658789Sahrens zio_gang_checksum_generate, 1659789Sahrens zio_ready, 1660789Sahrens zio_vdev_io_start, 1661789Sahrens zio_vdev_io_done, 1662789Sahrens zio_vdev_io_assess, 1663789Sahrens zio_wait_children_done, 1664789Sahrens zio_checksum_verify, 1665789Sahrens zio_read_gang_members, 1666789Sahrens zio_read_decompress, 1667789Sahrens zio_done, 1668789Sahrens zio_badop 1669789Sahrens }; 1670789Sahrens 1671789Sahrens /* 1672789Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1673789Sahrens * There's no locking on io_stage because there's no legitimate way for 1674789Sahrens * multiple threads to be attempting to process the same I/O. 1675789Sahrens */ 1676789Sahrens void 1677789Sahrens zio_next_stage(zio_t *zio) 1678789Sahrens { 1679789Sahrens uint32_t pipeline = zio->io_pipeline; 1680789Sahrens 1681789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1682789Sahrens 1683789Sahrens if (zio->io_error) { 1684789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1685789Sahrens zio, vdev_description(zio->io_vd), 1686789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1687789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1688789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1689789Sahrens } 1690789Sahrens 1691789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1692789Sahrens continue; 1693789Sahrens 1694789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1695789Sahrens ASSERT(zio->io_stalled == 0); 1696789Sahrens 1697789Sahrens zio_pipeline[zio->io_stage](zio); 1698789Sahrens } 1699789Sahrens 1700789Sahrens void 1701789Sahrens zio_next_stage_async(zio_t *zio) 1702789Sahrens { 1703789Sahrens taskq_t *tq; 1704789Sahrens uint32_t pipeline = zio->io_pipeline; 1705789Sahrens 1706789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1707789Sahrens 1708789Sahrens if (zio->io_error) { 1709789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1710789Sahrens zio, vdev_description(zio->io_vd), 1711789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1712789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1713789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1714789Sahrens } 1715789Sahrens 1716789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1717789Sahrens continue; 1718789Sahrens 1719789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1720789Sahrens ASSERT(zio->io_stalled == 0); 1721789Sahrens 1722789Sahrens /* 1723789Sahrens * For performance, we'll probably want two sets of task queues: 1724789Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1725789Sahrens * part is for read performance: since we have to make a pass over 1726789Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1727789Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1728789Sahrens * that thread is probably still there. Getting this optimization 1729789Sahrens * right avoids performance-hostile cache-to-cache transfers. 1730789Sahrens * 1731789Sahrens * Note that having two sets of task queues is also necessary for 1732789Sahrens * correctness: if all of the issue threads get bogged down waiting 1733789Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1734789Sahrens * there won't be any threads available to service I/O completion 1735789Sahrens * interrupts. 1736789Sahrens */ 1737789Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1738789Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1739789Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1740789Sahrens else 1741789Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1742789Sahrens (void) taskq_dispatch(tq, 1743789Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1744789Sahrens } else { 1745789Sahrens zio_pipeline[zio->io_stage](zio); 1746789Sahrens } 1747789Sahrens } 1748789Sahrens 1749789Sahrens /* 1750789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1751789Sahrens */ 1752789Sahrens int 17533063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 17543063Sperrin uint64_t txg) 1755789Sahrens { 1756789Sahrens int error; 1757789Sahrens 17581544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1759789Sahrens 17603063Sperrin /* 17613063Sperrin * We were passed the previous log blocks dva_t in bp->blk_dva[0]. 17623063Sperrin */ 17633063Sperrin error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); 1764789Sahrens 1765789Sahrens if (error == 0) { 17663063Sperrin BP_SET_LSIZE(new_bp, size); 17673063Sperrin BP_SET_PSIZE(new_bp, size); 17683063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 17693063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 17703063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 17713063Sperrin BP_SET_LEVEL(new_bp, 0); 17723063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 17733063Sperrin new_bp->blk_birth = txg; 1774789Sahrens } 1775789Sahrens 17761544Seschrock spa_config_exit(spa, FTAG); 1777789Sahrens 1778789Sahrens return (error); 1779789Sahrens } 1780789Sahrens 1781789Sahrens /* 1782789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 1783789Sahrens * nothing to do except metaslab_free() it. 1784789Sahrens */ 1785789Sahrens void 1786789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1787789Sahrens { 17881775Sbillm ASSERT(!BP_IS_GANG(bp)); 1789789Sahrens 17901544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1791789Sahrens 17921807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 1793789Sahrens 17941544Seschrock spa_config_exit(spa, FTAG); 1795789Sahrens } 1796