1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 22*3459Sek110237 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 67789Sahrens typedef struct zio_sync_pass { 68789Sahrens int zp_defer_free; /* defer frees after this pass */ 69789Sahrens int zp_dontcompress; /* don't compress after this pass */ 70789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 71789Sahrens } zio_sync_pass_t; 72789Sahrens 73789Sahrens zio_sync_pass_t zio_sync_pass = { 74789Sahrens 1, /* zp_defer_free */ 75789Sahrens 4, /* zp_dontcompress */ 76789Sahrens 1, /* zp_rewrite */ 77789Sahrens }; 78789Sahrens 79789Sahrens /* 80789Sahrens * ========================================================================== 81789Sahrens * I/O kmem caches 82789Sahrens * ========================================================================== 83789Sahrens */ 84789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 853290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 863290Sjohansen 873290Sjohansen #ifdef _KERNEL 883290Sjohansen extern vmem_t *zio_alloc_arena; 893290Sjohansen #endif 90789Sahrens 91789Sahrens void 92789Sahrens zio_init(void) 93789Sahrens { 94789Sahrens size_t c; 953290Sjohansen vmem_t *data_alloc_arena = NULL; 963290Sjohansen 973290Sjohansen #ifdef _KERNEL 983290Sjohansen data_alloc_arena = zio_alloc_arena; 993290Sjohansen #endif 100789Sahrens 101789Sahrens /* 102789Sahrens * For small buffers, we want a cache for each multiple of 103789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 104789Sahrens * for each quarter-power of 2. For large buffers, we want 105789Sahrens * a cache for each multiple of PAGESIZE. 106789Sahrens */ 107789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 108789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 109789Sahrens size_t p2 = size; 110789Sahrens size_t align = 0; 111789Sahrens 112789Sahrens while (p2 & (p2 - 1)) 113789Sahrens p2 &= p2 - 1; 114789Sahrens 115789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 116789Sahrens align = SPA_MINBLOCKSIZE; 117789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 118789Sahrens align = PAGESIZE; 119789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 120789Sahrens align = p2 >> 2; 121789Sahrens } 122789Sahrens 123789Sahrens if (align != 0) { 1243290Sjohansen char name[36]; 1252856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 126789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 127849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1283290Sjohansen 1293290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1303290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1313290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1323290Sjohansen KMC_NODEBUG); 1333290Sjohansen 134789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 135789Sahrens size, align); 136789Sahrens } 137789Sahrens } 138789Sahrens 139789Sahrens while (--c != 0) { 140789Sahrens ASSERT(zio_buf_cache[c] != NULL); 141789Sahrens if (zio_buf_cache[c - 1] == NULL) 142789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1433290Sjohansen 1443290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1453290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1463290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 147789Sahrens } 1481544Seschrock 1491544Seschrock zio_inject_init(); 150789Sahrens } 151789Sahrens 152789Sahrens void 153789Sahrens zio_fini(void) 154789Sahrens { 155789Sahrens size_t c; 156789Sahrens kmem_cache_t *last_cache = NULL; 1573290Sjohansen kmem_cache_t *last_data_cache = NULL; 158789Sahrens 159789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 160789Sahrens if (zio_buf_cache[c] != last_cache) { 161789Sahrens last_cache = zio_buf_cache[c]; 162789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 163789Sahrens } 164789Sahrens zio_buf_cache[c] = NULL; 1653290Sjohansen 1663290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 1673290Sjohansen last_data_cache = zio_data_buf_cache[c]; 1683290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 1693290Sjohansen } 1703290Sjohansen zio_data_buf_cache[c] = NULL; 171789Sahrens } 1721544Seschrock 1731544Seschrock zio_inject_fini(); 174789Sahrens } 175789Sahrens 176789Sahrens /* 177789Sahrens * ========================================================================== 178789Sahrens * Allocate and free I/O buffers 179789Sahrens * ========================================================================== 180789Sahrens */ 1813290Sjohansen 1823290Sjohansen /* 1833290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 1843290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 1853290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 1863290Sjohansen * excess / transient data in-core during a crashdump. 1873290Sjohansen */ 188789Sahrens void * 189789Sahrens zio_buf_alloc(size_t size) 190789Sahrens { 191789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 192789Sahrens 193789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 194789Sahrens 195789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 196789Sahrens } 197789Sahrens 1983290Sjohansen /* 1993290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2003290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2013290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2023290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2033290Sjohansen */ 2043290Sjohansen void * 2053290Sjohansen zio_data_buf_alloc(size_t size) 2063290Sjohansen { 2073290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2083290Sjohansen 2093290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2103290Sjohansen 2113290Sjohansen return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 2123290Sjohansen } 2133290Sjohansen 214789Sahrens void 215789Sahrens zio_buf_free(void *buf, size_t size) 216789Sahrens { 217789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 218789Sahrens 219789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 220789Sahrens 221789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 222789Sahrens } 223789Sahrens 2243290Sjohansen void 2253290Sjohansen zio_data_buf_free(void *buf, size_t size) 2263290Sjohansen { 2273290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2283290Sjohansen 2293290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2303290Sjohansen 2313290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2323290Sjohansen } 233789Sahrens /* 234789Sahrens * ========================================================================== 235789Sahrens * Push and pop I/O transform buffers 236789Sahrens * ========================================================================== 237789Sahrens */ 238789Sahrens static void 239789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 240789Sahrens { 241789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 242789Sahrens 243789Sahrens zt->zt_data = data; 244789Sahrens zt->zt_size = size; 245789Sahrens zt->zt_bufsize = bufsize; 246789Sahrens 247789Sahrens zt->zt_next = zio->io_transform_stack; 248789Sahrens zio->io_transform_stack = zt; 249789Sahrens 250789Sahrens zio->io_data = data; 251789Sahrens zio->io_size = size; 252789Sahrens } 253789Sahrens 254789Sahrens static void 255789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 256789Sahrens { 257789Sahrens zio_transform_t *zt = zio->io_transform_stack; 258789Sahrens 259789Sahrens *data = zt->zt_data; 260789Sahrens *size = zt->zt_size; 261789Sahrens *bufsize = zt->zt_bufsize; 262789Sahrens 263789Sahrens zio->io_transform_stack = zt->zt_next; 264789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 265789Sahrens 266789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 267789Sahrens zio->io_data = zt->zt_data; 268789Sahrens zio->io_size = zt->zt_size; 269789Sahrens } 270789Sahrens } 271789Sahrens 272789Sahrens static void 273789Sahrens zio_clear_transform_stack(zio_t *zio) 274789Sahrens { 275789Sahrens void *data; 276789Sahrens uint64_t size, bufsize; 277789Sahrens 278789Sahrens ASSERT(zio->io_transform_stack != NULL); 279789Sahrens 280789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 281789Sahrens while (zio->io_transform_stack != NULL) { 282789Sahrens zio_buf_free(data, bufsize); 283789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 284789Sahrens } 285789Sahrens } 286789Sahrens 287789Sahrens /* 288789Sahrens * ========================================================================== 289789Sahrens * Create the various types of I/O (read, write, free) 290789Sahrens * ========================================================================== 291789Sahrens */ 292789Sahrens static zio_t * 293789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 294789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 295789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 296789Sahrens { 297789Sahrens zio_t *zio; 298789Sahrens 299789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 300789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 301789Sahrens 302789Sahrens zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 303789Sahrens zio->io_parent = pio; 304789Sahrens zio->io_spa = spa; 305789Sahrens zio->io_txg = txg; 306789Sahrens if (bp != NULL) { 307789Sahrens zio->io_bp = bp; 308789Sahrens zio->io_bp_copy = *bp; 309789Sahrens zio->io_bp_orig = *bp; 310789Sahrens } 311789Sahrens zio->io_done = done; 312789Sahrens zio->io_private = private; 313789Sahrens zio->io_type = type; 314789Sahrens zio->io_priority = priority; 315789Sahrens zio->io_stage = stage; 316789Sahrens zio->io_pipeline = pipeline; 317789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 318789Sahrens zio->io_timestamp = lbolt64; 319789Sahrens zio->io_flags = flags; 3202856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 321789Sahrens zio_push_transform(zio, data, size, size); 322789Sahrens 323789Sahrens if (pio == NULL) { 324789Sahrens if (!(flags & ZIO_FLAG_CONFIG_HELD)) 3251544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 326789Sahrens zio->io_root = zio; 327789Sahrens } else { 328789Sahrens zio->io_root = pio->io_root; 3291544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3301544Seschrock zio->io_logical = pio->io_logical; 331789Sahrens mutex_enter(&pio->io_lock); 332789Sahrens if (stage < ZIO_STAGE_READY) 333789Sahrens pio->io_children_notready++; 334789Sahrens pio->io_children_notdone++; 335789Sahrens zio->io_sibling_next = pio->io_child; 336789Sahrens zio->io_sibling_prev = NULL; 337789Sahrens if (pio->io_child != NULL) 338789Sahrens pio->io_child->io_sibling_prev = zio; 339789Sahrens pio->io_child = zio; 3401775Sbillm zio->io_ndvas = pio->io_ndvas; 341789Sahrens mutex_exit(&pio->io_lock); 342789Sahrens } 343789Sahrens 344789Sahrens return (zio); 345789Sahrens } 346789Sahrens 347789Sahrens zio_t * 348789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 349789Sahrens int flags) 350789Sahrens { 351789Sahrens zio_t *zio; 352789Sahrens 353789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 354789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 355789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 356789Sahrens 357789Sahrens return (zio); 358789Sahrens } 359789Sahrens 360789Sahrens zio_t * 361789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 362789Sahrens { 363789Sahrens return (zio_null(NULL, spa, done, private, flags)); 364789Sahrens } 365789Sahrens 366789Sahrens zio_t * 367789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 368789Sahrens uint64_t size, zio_done_func_t *done, void *private, 3691544Seschrock int priority, int flags, zbookmark_t *zb) 370789Sahrens { 371789Sahrens zio_t *zio; 372789Sahrens 373789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 374789Sahrens 375789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 3762981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 3772981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 3781544Seschrock zio->io_bookmark = *zb; 3791544Seschrock 3801544Seschrock zio->io_logical = zio; 381789Sahrens 382789Sahrens /* 383789Sahrens * Work off our copy of the bp so the caller can free it. 384789Sahrens */ 385789Sahrens zio->io_bp = &zio->io_bp_copy; 386789Sahrens 387789Sahrens if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 388789Sahrens uint64_t csize = BP_GET_PSIZE(bp); 389789Sahrens void *cbuf = zio_buf_alloc(csize); 390789Sahrens 391789Sahrens zio_push_transform(zio, cbuf, csize, csize); 392789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 393789Sahrens } 394789Sahrens 3951775Sbillm if (BP_IS_GANG(bp)) { 396789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 397789Sahrens void *gbuf = zio_buf_alloc(gsize); 398789Sahrens 399789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 400789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 401789Sahrens } 402789Sahrens 403789Sahrens return (zio); 404789Sahrens } 405789Sahrens 406789Sahrens zio_t * 4071775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 408789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4091544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4101544Seschrock zbookmark_t *zb) 411789Sahrens { 412789Sahrens zio_t *zio; 413789Sahrens 414789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 415789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 416789Sahrens 417789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 418789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 419789Sahrens 420789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4212981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 422789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 423789Sahrens 4241544Seschrock zio->io_bookmark = *zb; 4251544Seschrock 4261544Seschrock zio->io_logical = zio; 4271544Seschrock 428789Sahrens zio->io_checksum = checksum; 429789Sahrens zio->io_compress = compress; 4301775Sbillm zio->io_ndvas = ncopies; 431789Sahrens 432789Sahrens if (compress != ZIO_COMPRESS_OFF) 433789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 434789Sahrens 435789Sahrens if (bp->blk_birth != txg) { 436789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 437789Sahrens BP_ZERO(bp); 438789Sahrens BP_SET_LSIZE(bp, size); 439789Sahrens BP_SET_PSIZE(bp, size); 4401775Sbillm } else { 4411775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 4421775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 4431775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 444789Sahrens } 445789Sahrens 446789Sahrens return (zio); 447789Sahrens } 448789Sahrens 449789Sahrens zio_t * 450789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 451789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4521544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 4531544Seschrock zbookmark_t *zb) 454789Sahrens { 455789Sahrens zio_t *zio; 456789Sahrens 457789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 4582981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 459789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 460789Sahrens 4611544Seschrock zio->io_bookmark = *zb; 462789Sahrens zio->io_checksum = checksum; 463789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 464789Sahrens 4651775Sbillm if (pio != NULL) 4661775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 4671775Sbillm 468789Sahrens return (zio); 469789Sahrens } 470789Sahrens 471789Sahrens static zio_t * 472789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 473789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 474789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 475789Sahrens { 476789Sahrens zio_t *zio; 477789Sahrens 478789Sahrens BP_ZERO(bp); 479789Sahrens BP_SET_LSIZE(bp, size); 480789Sahrens BP_SET_PSIZE(bp, size); 481789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 482789Sahrens 483789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 484789Sahrens ZIO_TYPE_WRITE, priority, flags, 485789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 486789Sahrens 487789Sahrens zio->io_checksum = checksum; 488789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 489789Sahrens 490789Sahrens return (zio); 491789Sahrens } 492789Sahrens 493789Sahrens zio_t * 494789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 495789Sahrens zio_done_func_t *done, void *private) 496789Sahrens { 497789Sahrens zio_t *zio; 498789Sahrens 499789Sahrens ASSERT(!BP_IS_HOLE(bp)); 500789Sahrens 501789Sahrens if (txg == spa->spa_syncing_txg && 502789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 503789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 504789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 505789Sahrens } 506789Sahrens 507789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 5082981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 509789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 510789Sahrens 511789Sahrens zio->io_bp = &zio->io_bp_copy; 512789Sahrens 513789Sahrens return (zio); 514789Sahrens } 515789Sahrens 516789Sahrens zio_t * 517789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 518789Sahrens zio_done_func_t *done, void *private) 519789Sahrens { 520789Sahrens zio_t *zio; 521789Sahrens 522789Sahrens /* 523789Sahrens * A claim is an allocation of a specific block. Claims are needed 524789Sahrens * to support immediate writes in the intent log. The issue is that 525789Sahrens * immediate writes contain committed data, but in a txg that was 526789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 527789Sahrens * the intent log claims all blocks that contain immediate write data 528789Sahrens * so that the SPA knows they're in use. 529789Sahrens * 530789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 531789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 532789Sahrens */ 533789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 534789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 535789Sahrens 536789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 537789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 538789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 539789Sahrens 540789Sahrens zio->io_bp = &zio->io_bp_copy; 541789Sahrens 542789Sahrens return (zio); 543789Sahrens } 544789Sahrens 545789Sahrens zio_t * 546789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 547789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 548789Sahrens { 549789Sahrens zio_t *zio; 550789Sahrens int c; 551789Sahrens 552789Sahrens if (vd->vdev_children == 0) { 553789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 554789Sahrens ZIO_TYPE_IOCTL, priority, flags, 555789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 556789Sahrens 557789Sahrens zio->io_vd = vd; 558789Sahrens zio->io_cmd = cmd; 559789Sahrens } else { 560789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 561789Sahrens 562789Sahrens for (c = 0; c < vd->vdev_children; c++) 563789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 564789Sahrens done, private, priority, flags)); 565789Sahrens } 566789Sahrens 567789Sahrens return (zio); 568789Sahrens } 569789Sahrens 570789Sahrens static void 571789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 572789Sahrens int checksum) 573789Sahrens { 574789Sahrens ASSERT(vd->vdev_children == 0); 575789Sahrens 576789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 577789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 578789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 579789Sahrens 580789Sahrens ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 581789Sahrens offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 582789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 583789Sahrens 584789Sahrens BP_ZERO(bp); 585789Sahrens 586789Sahrens BP_SET_LSIZE(bp, size); 587789Sahrens BP_SET_PSIZE(bp, size); 588789Sahrens 589789Sahrens BP_SET_CHECKSUM(bp, checksum); 590789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 591789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 592789Sahrens 593789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 594789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 595789Sahrens } 596789Sahrens 597789Sahrens zio_t * 598789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 599789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 600789Sahrens int priority, int flags) 601789Sahrens { 602789Sahrens zio_t *zio; 603789Sahrens blkptr_t blk; 604789Sahrens 605789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 606789Sahrens 607789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 608789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 609789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 610789Sahrens 611789Sahrens zio->io_vd = vd; 612789Sahrens zio->io_offset = offset; 613789Sahrens 614789Sahrens /* 615789Sahrens * Work off our copy of the bp so the caller can free it. 616789Sahrens */ 617789Sahrens zio->io_bp = &zio->io_bp_copy; 618789Sahrens 619789Sahrens return (zio); 620789Sahrens } 621789Sahrens 622789Sahrens zio_t * 623789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 624789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 625789Sahrens int priority, int flags) 626789Sahrens { 627789Sahrens zio_block_tail_t *zbt; 628789Sahrens void *wbuf; 629789Sahrens zio_t *zio; 630789Sahrens blkptr_t blk; 631789Sahrens 632789Sahrens zio_phys_bp_init(vd, &blk, offset, size, checksum); 633789Sahrens 634789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 635789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 636789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 637789Sahrens 638789Sahrens zio->io_vd = vd; 639789Sahrens zio->io_offset = offset; 640789Sahrens 641789Sahrens zio->io_bp = &zio->io_bp_copy; 642789Sahrens zio->io_checksum = checksum; 643789Sahrens 644789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 645789Sahrens /* 646789Sahrens * zbt checksums are necessarily destructive -- they modify 647789Sahrens * one word of the write buffer to hold the verifier/checksum. 648789Sahrens * Therefore, we must make a local copy in case the data is 649789Sahrens * being written to multiple places. 650789Sahrens */ 651789Sahrens wbuf = zio_buf_alloc(size); 652789Sahrens bcopy(data, wbuf, size); 653789Sahrens zio_push_transform(zio, wbuf, size, size); 654789Sahrens 655789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 656789Sahrens zbt->zbt_cksum = blk.blk_cksum; 657789Sahrens } 658789Sahrens 659789Sahrens return (zio); 660789Sahrens } 661789Sahrens 662789Sahrens /* 663789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 664789Sahrens */ 665789Sahrens zio_t * 666789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 667789Sahrens void *data, uint64_t size, int type, int priority, int flags, 668789Sahrens zio_done_func_t *done, void *private) 669789Sahrens { 670789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 671789Sahrens zio_t *cio; 672789Sahrens 673789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 674789Sahrens /* 675789Sahrens * If we have the bp, then the child should perform the 676789Sahrens * checksum and the parent need not. This pushes error 677789Sahrens * detection as close to the leaves as possible and 678789Sahrens * eliminates redundant checksums in the interior nodes. 679789Sahrens */ 680789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 681789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 682789Sahrens } 683789Sahrens 684789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 685789Sahrens done, private, type, priority, 686789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 6871775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 688789Sahrens 689789Sahrens cio->io_vd = vd; 690789Sahrens cio->io_offset = offset; 691789Sahrens 692789Sahrens return (cio); 693789Sahrens } 694789Sahrens 695789Sahrens /* 696789Sahrens * ========================================================================== 697789Sahrens * Initiate I/O, either sync or async 698789Sahrens * ========================================================================== 699789Sahrens */ 700789Sahrens int 701789Sahrens zio_wait(zio_t *zio) 702789Sahrens { 703789Sahrens int error; 704789Sahrens 705789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 706789Sahrens 707789Sahrens zio->io_waiter = curthread; 708789Sahrens 709789Sahrens zio_next_stage_async(zio); 710789Sahrens 711789Sahrens mutex_enter(&zio->io_lock); 712789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 713789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 714789Sahrens mutex_exit(&zio->io_lock); 715789Sahrens 716789Sahrens error = zio->io_error; 7172856Snd150628 mutex_destroy(&zio->io_lock); 718789Sahrens kmem_free(zio, sizeof (zio_t)); 719789Sahrens 720789Sahrens return (error); 721789Sahrens } 722789Sahrens 723789Sahrens void 724789Sahrens zio_nowait(zio_t *zio) 725789Sahrens { 726789Sahrens zio_next_stage_async(zio); 727789Sahrens } 728789Sahrens 729789Sahrens /* 730789Sahrens * ========================================================================== 731789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 732789Sahrens * ========================================================================== 733789Sahrens */ 734789Sahrens static void 735789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 736789Sahrens { 737789Sahrens mutex_enter(&zio->io_lock); 738789Sahrens if (*countp == 0) { 739789Sahrens ASSERT(zio->io_stalled == 0); 740789Sahrens mutex_exit(&zio->io_lock); 741789Sahrens zio_next_stage(zio); 742789Sahrens } else { 743789Sahrens zio->io_stalled = stage; 744789Sahrens mutex_exit(&zio->io_lock); 745789Sahrens } 746789Sahrens } 747789Sahrens 748789Sahrens static void 749789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 750789Sahrens { 751789Sahrens zio_t *pio = zio->io_parent; 752789Sahrens 753789Sahrens mutex_enter(&pio->io_lock); 754789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 755789Sahrens pio->io_error = zio->io_error; 756789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 757789Sahrens pio->io_stalled = 0; 758789Sahrens mutex_exit(&pio->io_lock); 759789Sahrens zio_next_stage_async(pio); 760789Sahrens } else { 761789Sahrens mutex_exit(&pio->io_lock); 762789Sahrens } 763789Sahrens } 764789Sahrens 765789Sahrens static void 766789Sahrens zio_wait_children_ready(zio_t *zio) 767789Sahrens { 768789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 769789Sahrens &zio->io_children_notready); 770789Sahrens } 771789Sahrens 772789Sahrens void 773789Sahrens zio_wait_children_done(zio_t *zio) 774789Sahrens { 775789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 776789Sahrens &zio->io_children_notdone); 777789Sahrens } 778789Sahrens 779789Sahrens static void 780789Sahrens zio_ready(zio_t *zio) 781789Sahrens { 782789Sahrens zio_t *pio = zio->io_parent; 783789Sahrens 784789Sahrens if (pio != NULL) 785789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 786789Sahrens &pio->io_children_notready); 787789Sahrens 788789Sahrens if (zio->io_bp) 789789Sahrens zio->io_bp_copy = *zio->io_bp; 790789Sahrens 791789Sahrens zio_next_stage(zio); 792789Sahrens } 793789Sahrens 794789Sahrens static void 795789Sahrens zio_done(zio_t *zio) 796789Sahrens { 797789Sahrens zio_t *pio = zio->io_parent; 798789Sahrens spa_t *spa = zio->io_spa; 799789Sahrens blkptr_t *bp = zio->io_bp; 800789Sahrens vdev_t *vd = zio->io_vd; 801789Sahrens 802789Sahrens ASSERT(zio->io_children_notready == 0); 803789Sahrens ASSERT(zio->io_children_notdone == 0); 804789Sahrens 805789Sahrens if (bp != NULL) { 806789Sahrens ASSERT(bp->blk_pad[0] == 0); 807789Sahrens ASSERT(bp->blk_pad[1] == 0); 808789Sahrens ASSERT(bp->blk_pad[2] == 0); 809789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 810789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 8111775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 812789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 8131775Sbillm if (zio->io_ndvas != 0) 8141775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 8151775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 8161775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 8171775Sbillm } 818789Sahrens } 819789Sahrens 820789Sahrens if (vd != NULL) 821789Sahrens vdev_stat_update(zio); 822789Sahrens 823789Sahrens if (zio->io_error) { 8241544Seschrock /* 8251544Seschrock * If this I/O is attached to a particular vdev, 8261544Seschrock * generate an error message describing the I/O failure 8271544Seschrock * at the block level. We ignore these errors if the 8281544Seschrock * device is currently unavailable. 8291544Seschrock */ 8301732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 8311544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_IO, 8321732Sbonwick zio->io_spa, vd, zio, 0, 0); 833789Sahrens 8341544Seschrock if ((zio->io_error == EIO || 8351544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 8361544Seschrock zio->io_logical == zio) { 8371544Seschrock /* 8381544Seschrock * For root I/O requests, tell the SPA to log the error 8391544Seschrock * appropriately. Also, generate a logical data 8401544Seschrock * ereport. 8411544Seschrock */ 8421544Seschrock spa_log_error(zio->io_spa, zio); 8431544Seschrock 8441544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_DATA, 8451544Seschrock zio->io_spa, NULL, zio, 0, 0); 8461544Seschrock } 847789Sahrens 8481544Seschrock /* 8491544Seschrock * For I/O requests that cannot fail, panic appropriately. 8501544Seschrock */ 8511544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 852*3459Sek110237 char *blkbuf; 853*3459Sek110237 854*3459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 855*3459Sek110237 if (blkbuf) { 856*3459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 857*3459Sek110237 bp ? bp : &zio->io_bp_copy); 858*3459Sek110237 } 8591544Seschrock panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " 8601544Seschrock "%d", zio->io_error == ECKSUM ? 8611544Seschrock "bad checksum" : "I/O failure", 8621544Seschrock zio_type_name[zio->io_type], 8631544Seschrock vdev_description(vd), 8641544Seschrock (u_longlong_t)zio->io_offset, 865*3459Sek110237 zio, blkbuf ? blkbuf : "", zio->io_error); 8661544Seschrock } 867789Sahrens } 868789Sahrens zio_clear_transform_stack(zio); 869789Sahrens 870789Sahrens if (zio->io_done) 871789Sahrens zio->io_done(zio); 872789Sahrens 873789Sahrens ASSERT(zio->io_delegate_list == NULL); 874789Sahrens ASSERT(zio->io_delegate_next == NULL); 875789Sahrens 876789Sahrens if (pio != NULL) { 877789Sahrens zio_t *next, *prev; 878789Sahrens 879789Sahrens mutex_enter(&pio->io_lock); 880789Sahrens next = zio->io_sibling_next; 881789Sahrens prev = zio->io_sibling_prev; 882789Sahrens if (next != NULL) 883789Sahrens next->io_sibling_prev = prev; 884789Sahrens if (prev != NULL) 885789Sahrens prev->io_sibling_next = next; 886789Sahrens if (pio->io_child == zio) 887789Sahrens pio->io_child = next; 888789Sahrens mutex_exit(&pio->io_lock); 889789Sahrens 890789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 891789Sahrens &pio->io_children_notdone); 892789Sahrens } 893789Sahrens 894789Sahrens if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) 8951544Seschrock spa_config_exit(spa, zio); 896789Sahrens 897789Sahrens if (zio->io_waiter != NULL) { 898789Sahrens mutex_enter(&zio->io_lock); 899789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 900789Sahrens zio->io_stalled = zio->io_stage; 901789Sahrens cv_broadcast(&zio->io_cv); 902789Sahrens mutex_exit(&zio->io_lock); 903789Sahrens } else { 904789Sahrens kmem_free(zio, sizeof (zio_t)); 905789Sahrens } 906789Sahrens } 907789Sahrens 908789Sahrens /* 909789Sahrens * ========================================================================== 910789Sahrens * Compression support 911789Sahrens * ========================================================================== 912789Sahrens */ 913789Sahrens static void 914789Sahrens zio_write_compress(zio_t *zio) 915789Sahrens { 916789Sahrens int compress = zio->io_compress; 917789Sahrens blkptr_t *bp = zio->io_bp; 918789Sahrens void *cbuf; 919789Sahrens uint64_t lsize = zio->io_size; 920789Sahrens uint64_t csize = lsize; 921789Sahrens uint64_t cbufsize = 0; 922789Sahrens int pass; 923789Sahrens 924789Sahrens if (bp->blk_birth == zio->io_txg) { 925789Sahrens /* 926789Sahrens * We're rewriting an existing block, which means we're 927789Sahrens * working on behalf of spa_sync(). For spa_sync() to 928789Sahrens * converge, it must eventually be the case that we don't 929789Sahrens * have to allocate new blocks. But compression changes 930789Sahrens * the blocksize, which forces a reallocate, and makes 931789Sahrens * convergence take longer. Therefore, after the first 932789Sahrens * few passes, stop compressing to ensure convergence. 933789Sahrens */ 934789Sahrens pass = spa_sync_pass(zio->io_spa); 935789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 936789Sahrens compress = ZIO_COMPRESS_OFF; 937789Sahrens } else { 938789Sahrens ASSERT(BP_IS_HOLE(bp)); 939789Sahrens pass = 1; 940789Sahrens } 941789Sahrens 942789Sahrens if (compress != ZIO_COMPRESS_OFF) 943789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 944789Sahrens &cbuf, &csize, &cbufsize)) 945789Sahrens compress = ZIO_COMPRESS_OFF; 946789Sahrens 947789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 948789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 949789Sahrens 950789Sahrens /* 951789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 952789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 953789Sahrens * but newly allocated blocks are sequential, so they can be written 954789Sahrens * to disk faster. Therefore, we allow the first few passes of 955789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 956789Sahrens * There should only be a handful of blocks after pass 1 in any case. 957789Sahrens */ 958789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 959789Sahrens pass > zio_sync_pass.zp_rewrite) { 960789Sahrens ASSERT(csize != 0); 9612885Sahrens BP_SET_LSIZE(bp, lsize); 9622885Sahrens BP_SET_COMPRESS(bp, compress); 963789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 964789Sahrens } else { 965789Sahrens if (bp->blk_birth == zio->io_txg) { 966789Sahrens ASSERT3U(BP_GET_LSIZE(bp), ==, lsize); 967789Sahrens bzero(bp, sizeof (blkptr_t)); 968789Sahrens } 969789Sahrens if (csize == 0) { 970789Sahrens BP_ZERO(bp); 971789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 972789Sahrens } else { 9731775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 974789Sahrens BP_SET_LSIZE(bp, lsize); 975789Sahrens BP_SET_PSIZE(bp, csize); 976789Sahrens BP_SET_COMPRESS(bp, compress); 977789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 978789Sahrens } 979789Sahrens } 980789Sahrens 981789Sahrens zio_next_stage(zio); 982789Sahrens } 983789Sahrens 984789Sahrens static void 985789Sahrens zio_read_decompress(zio_t *zio) 986789Sahrens { 987789Sahrens blkptr_t *bp = zio->io_bp; 988789Sahrens void *data; 989789Sahrens uint64_t size; 990789Sahrens uint64_t bufsize; 991789Sahrens int compress = BP_GET_COMPRESS(bp); 992789Sahrens 993789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 994789Sahrens 995789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 996789Sahrens 997789Sahrens if (zio_decompress_data(compress, data, size, 998789Sahrens zio->io_data, zio->io_size)) 999789Sahrens zio->io_error = EIO; 1000789Sahrens 1001789Sahrens zio_buf_free(data, bufsize); 1002789Sahrens 1003789Sahrens zio_next_stage(zio); 1004789Sahrens } 1005789Sahrens 1006789Sahrens /* 1007789Sahrens * ========================================================================== 1008789Sahrens * Gang block support 1009789Sahrens * ========================================================================== 1010789Sahrens */ 1011789Sahrens static void 1012789Sahrens zio_gang_pipeline(zio_t *zio) 1013789Sahrens { 1014789Sahrens /* 1015789Sahrens * By default, the pipeline assumes that we're dealing with a gang 1016789Sahrens * block. If we're not, strip out any gang-specific stages. 1017789Sahrens */ 10181775Sbillm if (!BP_IS_GANG(zio->io_bp)) 1019789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 1020789Sahrens 1021789Sahrens zio_next_stage(zio); 1022789Sahrens } 1023789Sahrens 1024789Sahrens static void 1025789Sahrens zio_gang_byteswap(zio_t *zio) 1026789Sahrens { 1027789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1028789Sahrens 1029789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1030789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1031789Sahrens } 1032789Sahrens 1033789Sahrens static void 1034789Sahrens zio_get_gang_header(zio_t *zio) 1035789Sahrens { 1036789Sahrens blkptr_t *bp = zio->io_bp; 1037789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1038789Sahrens void *gbuf = zio_buf_alloc(gsize); 1039789Sahrens 10401775Sbillm ASSERT(BP_IS_GANG(bp)); 1041789Sahrens 1042789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1043789Sahrens 1044789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1045789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1046789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1047789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE)); 1048789Sahrens 1049789Sahrens zio_wait_children_done(zio); 1050789Sahrens } 1051789Sahrens 1052789Sahrens static void 1053789Sahrens zio_read_gang_members(zio_t *zio) 1054789Sahrens { 1055789Sahrens zio_gbh_phys_t *gbh; 1056789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1057789Sahrens int i; 1058789Sahrens 10591775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1060789Sahrens 1061789Sahrens zio_gang_byteswap(zio); 1062789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1063789Sahrens 1064789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1065789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1066789Sahrens lsize = BP_GET_PSIZE(gbp); 1067789Sahrens 1068789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1069789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1070789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1071789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1072789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1073789Sahrens 1074789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1075789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 10761544Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 10771544Seschrock &zio->io_bookmark)); 1078789Sahrens } 1079789Sahrens 1080789Sahrens zio_buf_free(gbh, gbufsize); 1081789Sahrens zio_wait_children_done(zio); 1082789Sahrens } 1083789Sahrens 1084789Sahrens static void 1085789Sahrens zio_rewrite_gang_members(zio_t *zio) 1086789Sahrens { 1087789Sahrens zio_gbh_phys_t *gbh; 1088789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1089789Sahrens int i; 1090789Sahrens 10911775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1092789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1093789Sahrens 1094789Sahrens zio_gang_byteswap(zio); 1095789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1096789Sahrens 1097789Sahrens ASSERT(gsize == gbufsize); 1098789Sahrens 1099789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1100789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1101789Sahrens lsize = BP_GET_PSIZE(gbp); 1102789Sahrens 1103789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1104789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1105789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1106789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1107789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1108789Sahrens 1109789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1110789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 11111544Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 11121544Seschrock &zio->io_bookmark)); 1113789Sahrens } 1114789Sahrens 1115789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1116789Sahrens zio_wait_children_ready(zio); 1117789Sahrens } 1118789Sahrens 1119789Sahrens static void 1120789Sahrens zio_free_gang_members(zio_t *zio) 1121789Sahrens { 1122789Sahrens zio_gbh_phys_t *gbh; 1123789Sahrens uint64_t gsize, gbufsize; 1124789Sahrens int i; 1125789Sahrens 11261775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1127789Sahrens 1128789Sahrens zio_gang_byteswap(zio); 1129789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1130789Sahrens 1131789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1132789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1133789Sahrens 1134789Sahrens if (BP_IS_HOLE(gbp)) 1135789Sahrens continue; 1136789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1137789Sahrens gbp, NULL, NULL)); 1138789Sahrens } 1139789Sahrens 1140789Sahrens zio_buf_free(gbh, gbufsize); 1141789Sahrens zio_next_stage(zio); 1142789Sahrens } 1143789Sahrens 1144789Sahrens static void 1145789Sahrens zio_claim_gang_members(zio_t *zio) 1146789Sahrens { 1147789Sahrens zio_gbh_phys_t *gbh; 1148789Sahrens uint64_t gsize, gbufsize; 1149789Sahrens int i; 1150789Sahrens 11511775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1152789Sahrens 1153789Sahrens zio_gang_byteswap(zio); 1154789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1155789Sahrens 1156789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1157789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1158789Sahrens if (BP_IS_HOLE(gbp)) 1159789Sahrens continue; 1160789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1161789Sahrens gbp, NULL, NULL)); 1162789Sahrens } 1163789Sahrens 1164789Sahrens zio_buf_free(gbh, gbufsize); 1165789Sahrens zio_next_stage(zio); 1166789Sahrens } 1167789Sahrens 1168789Sahrens static void 1169789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1170789Sahrens { 1171789Sahrens zio_t *pio = zio->io_parent; 11721775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 11731775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1174789Sahrens uint64_t asize; 11751775Sbillm int d; 1176789Sahrens 11771775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 11781775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 11791775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 11801775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 11811775Sbillm 1182789Sahrens mutex_enter(&pio->io_lock); 11831775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 11841775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 11851775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 11861775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 11871775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 11881775Sbillm } 1189789Sahrens mutex_exit(&pio->io_lock); 1190789Sahrens } 1191789Sahrens 1192789Sahrens static void 1193789Sahrens zio_write_allocate_gang_members(zio_t *zio) 1194789Sahrens { 1195789Sahrens blkptr_t *bp = zio->io_bp; 11961775Sbillm dva_t *dva = bp->blk_dva; 11971775Sbillm spa_t *spa = zio->io_spa; 1198789Sahrens zio_gbh_phys_t *gbh; 11991775Sbillm uint64_t txg = zio->io_txg; 1200789Sahrens uint64_t resid = zio->io_size; 1201789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1202789Sahrens uint64_t gsize, loff, lsize; 1203789Sahrens uint32_t gbps_left; 12041775Sbillm int ndvas = zio->io_ndvas; 12051775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1206789Sahrens int error; 12071775Sbillm int i, d; 1208789Sahrens 1209789Sahrens gsize = SPA_GANGBLOCKSIZE; 1210789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1211789Sahrens 12123063Sperrin error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); 1213789Sahrens if (error == ENOSPC) 1214789Sahrens panic("can't allocate gang block header"); 1215789Sahrens ASSERT(error == 0); 1216789Sahrens 12171775Sbillm for (d = 0; d < gbh_ndvas; d++) 12181775Sbillm DVA_SET_GANG(&dva[d], 1); 1219789Sahrens 12201775Sbillm bp->blk_birth = txg; 1221789Sahrens 1222789Sahrens gbh = zio_buf_alloc(gsize); 1223789Sahrens bzero(gbh, gsize); 1224789Sahrens 12251775Sbillm /* We need to test multi-level gang blocks */ 12261775Sbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 12271775Sbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 12281775Sbillm 1229789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1230789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1231789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 12321775Sbillm dva = gbp->blk_dva; 1233789Sahrens 1234789Sahrens ASSERT(gbps_left != 0); 1235789Sahrens maxalloc = MIN(maxalloc, resid); 1236789Sahrens 1237789Sahrens while (resid <= maxalloc * gbps_left) { 12381775Sbillm error = metaslab_alloc(spa, maxalloc, gbp, ndvas, 12393063Sperrin txg, bp, B_FALSE); 1240789Sahrens if (error == 0) 1241789Sahrens break; 1242789Sahrens ASSERT3U(error, ==, ENOSPC); 1243789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 1244789Sahrens panic("really out of space"); 1245789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1246789Sahrens } 1247789Sahrens 1248789Sahrens if (resid <= maxalloc * gbps_left) { 1249789Sahrens lsize = maxalloc; 1250789Sahrens BP_SET_LSIZE(gbp, lsize); 1251789Sahrens BP_SET_PSIZE(gbp, lsize); 1252789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 12531775Sbillm gbp->blk_birth = txg; 12541775Sbillm zio_nowait(zio_rewrite(zio, spa, 12551775Sbillm zio->io_checksum, txg, gbp, 1256789Sahrens (char *)zio->io_data + loff, lsize, 1257789Sahrens zio_write_allocate_gang_member_done, NULL, 12581544Seschrock zio->io_priority, zio->io_flags, 12591544Seschrock &zio->io_bookmark)); 1260789Sahrens } else { 1261789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1262789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 12631775Sbillm zio_nowait(zio_write_allocate(zio, spa, 12641775Sbillm zio->io_checksum, txg, gbp, 1265789Sahrens (char *)zio->io_data + loff, lsize, 1266789Sahrens zio_write_allocate_gang_member_done, NULL, 1267789Sahrens zio->io_priority, zio->io_flags)); 1268789Sahrens } 1269789Sahrens } 1270789Sahrens 1271789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1272789Sahrens 1273789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1274789Sahrens 1275789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 12761775Sbillm /* 12771775Sbillm * As much as we'd like this to be zio_wait_children_ready(), 12781775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 12791775Sbillm * so we have to wait for that to finish in order for our BP 12801775Sbillm * to be stable. 12811775Sbillm */ 1282789Sahrens zio_wait_children_done(zio); 1283789Sahrens } 1284789Sahrens 1285789Sahrens /* 1286789Sahrens * ========================================================================== 1287789Sahrens * Allocate and free blocks 1288789Sahrens * ========================================================================== 1289789Sahrens */ 1290789Sahrens static void 1291789Sahrens zio_dva_allocate(zio_t *zio) 1292789Sahrens { 1293789Sahrens blkptr_t *bp = zio->io_bp; 1294789Sahrens int error; 1295789Sahrens 1296789Sahrens ASSERT(BP_IS_HOLE(bp)); 12971775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 12981775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 12991775Sbillm ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa)); 1300789Sahrens 1301789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1302789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 1303789Sahrens zio_write_allocate_gang_members(zio); 1304789Sahrens return; 1305789Sahrens } 1306789Sahrens 1307789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1308789Sahrens 13091775Sbillm error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas, 13103063Sperrin zio->io_txg, NULL, B_FALSE); 1311789Sahrens 1312789Sahrens if (error == 0) { 1313789Sahrens bp->blk_birth = zio->io_txg; 1314789Sahrens } else if (error == ENOSPC) { 1315789Sahrens if (zio->io_size == SPA_MINBLOCKSIZE) 1316789Sahrens panic("really, truly out of space"); 1317789Sahrens zio_write_allocate_gang_members(zio); 1318789Sahrens return; 1319789Sahrens } else { 1320789Sahrens zio->io_error = error; 1321789Sahrens } 1322789Sahrens zio_next_stage(zio); 1323789Sahrens } 1324789Sahrens 1325789Sahrens static void 1326789Sahrens zio_dva_free(zio_t *zio) 1327789Sahrens { 1328789Sahrens blkptr_t *bp = zio->io_bp; 1329789Sahrens 13301807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1331789Sahrens 1332789Sahrens BP_ZERO(bp); 1333789Sahrens 1334789Sahrens zio_next_stage(zio); 1335789Sahrens } 1336789Sahrens 1337789Sahrens static void 1338789Sahrens zio_dva_claim(zio_t *zio) 1339789Sahrens { 13401807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1341789Sahrens 1342789Sahrens zio_next_stage(zio); 1343789Sahrens } 1344789Sahrens 1345789Sahrens /* 1346789Sahrens * ========================================================================== 1347789Sahrens * Read and write to physical devices 1348789Sahrens * ========================================================================== 1349789Sahrens */ 1350789Sahrens 1351789Sahrens static void 13521775Sbillm zio_vdev_io_start(zio_t *zio) 1353789Sahrens { 1354789Sahrens vdev_t *vd = zio->io_vd; 13551775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 13561775Sbillm blkptr_t *bp = zio->io_bp; 13571775Sbillm uint64_t align; 1358789Sahrens 13591775Sbillm if (vd == NULL) { 13601775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 13611775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 13621775Sbillm return; 13631775Sbillm } 13641775Sbillm 13651775Sbillm align = 1ULL << tvd->vdev_ashift; 13661775Sbillm 13671732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1368789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1369789Sahrens 13701775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 13711775Sbillm vd->vdev_children == 0) { 1372789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1373789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1374789Sahrens } 1375789Sahrens 13761732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 13771732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 13781732Sbonwick char *abuf = zio_buf_alloc(asize); 13791732Sbonwick ASSERT(vd == tvd); 13801732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 13811732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 13821732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 13831732Sbonwick } 13841732Sbonwick zio_push_transform(zio, abuf, asize, asize); 13851732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 13861732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 13871732Sbonwick } 13881732Sbonwick 13891732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 13901732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 13911732Sbonwick ASSERT(bp == NULL || 13921732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1393789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1394789Sahrens 1395789Sahrens vdev_io_start(zio); 1396789Sahrens 1397789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1398789Sahrens } 1399789Sahrens 1400789Sahrens static void 1401789Sahrens zio_vdev_io_done(zio_t *zio) 1402789Sahrens { 14031775Sbillm if (zio->io_vd == NULL) 14041775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 14051775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 14061775Sbillm else 14071775Sbillm vdev_io_done(zio); 1408789Sahrens } 1409789Sahrens 1410789Sahrens /* XXPOLICY */ 14111544Seschrock boolean_t 1412789Sahrens zio_should_retry(zio_t *zio) 1413789Sahrens { 1414789Sahrens vdev_t *vd = zio->io_vd; 1415789Sahrens 1416789Sahrens if (zio->io_error == 0) 1417789Sahrens return (B_FALSE); 1418789Sahrens if (zio->io_delegate_list != NULL) 1419789Sahrens return (B_FALSE); 14201775Sbillm if (vd && vd != vd->vdev_top) 1421789Sahrens return (B_FALSE); 1422789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1423789Sahrens return (B_FALSE); 14241544Seschrock if (zio->io_retries > 0) 1425789Sahrens return (B_FALSE); 1426789Sahrens 1427789Sahrens return (B_TRUE); 1428789Sahrens } 1429789Sahrens 1430789Sahrens static void 1431789Sahrens zio_vdev_io_assess(zio_t *zio) 1432789Sahrens { 1433789Sahrens vdev_t *vd = zio->io_vd; 14341775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1435789Sahrens 14361544Seschrock ASSERT(zio->io_vsd == NULL); 1437789Sahrens 14381732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 14391732Sbonwick void *abuf; 14401732Sbonwick uint64_t asize; 14411732Sbonwick ASSERT(vd == tvd); 14421732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 14431732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 14441732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 14451732Sbonwick zio_buf_free(abuf, asize); 14461732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 14471732Sbonwick } 14481732Sbonwick 14491544Seschrock if (zio_injection_enabled && !zio->io_error) 14501544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1451789Sahrens 1452789Sahrens /* 1453789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1454789Sahrens */ 1455789Sahrens /* XXPOLICY */ 1456789Sahrens if (zio_should_retry(zio)) { 1457789Sahrens ASSERT(tvd == vd); 1458789Sahrens 1459789Sahrens zio->io_retries++; 1460789Sahrens zio->io_error = 0; 1461789Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT; 1462789Sahrens /* XXPOLICY */ 1463789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1464789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 14651775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1466789Sahrens 1467789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1468789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1469789Sahrens vdev_description(vd), zio->io_offset); 1470789Sahrens 14711544Seschrock zio_next_stage_async(zio); 14721544Seschrock return; 14731544Seschrock } 1474789Sahrens 14751775Sbillm if (zio->io_error != 0 && zio->io_error != ECKSUM && 14761775Sbillm !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) { 1477789Sahrens /* 14781544Seschrock * Poor man's hotplug support. Even if we're done retrying this 14791544Seschrock * I/O, try to reopen the vdev to see if it's still attached. 14801544Seschrock * To avoid excessive thrashing, we only try it once a minute. 14811544Seschrock * This also has the effect of detecting when missing devices 14821544Seschrock * have come back, by polling the device once a minute. 14831544Seschrock * 14841544Seschrock * We need to do this asynchronously because we can't grab 14851544Seschrock * all the necessary locks way down here. 1486789Sahrens */ 14871544Seschrock if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { 14881544Seschrock vd->vdev_last_try = gethrtime(); 14891544Seschrock tvd->vdev_reopen_wanted = 1; 14901544Seschrock spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); 14911544Seschrock } 1492789Sahrens } 1493789Sahrens 1494789Sahrens zio_next_stage(zio); 1495789Sahrens } 1496789Sahrens 1497789Sahrens void 1498789Sahrens zio_vdev_io_reissue(zio_t *zio) 1499789Sahrens { 1500789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1501789Sahrens ASSERT(zio->io_error == 0); 1502789Sahrens 1503789Sahrens zio->io_stage--; 1504789Sahrens } 1505789Sahrens 1506789Sahrens void 1507789Sahrens zio_vdev_io_redone(zio_t *zio) 1508789Sahrens { 1509789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1510789Sahrens 1511789Sahrens zio->io_stage--; 1512789Sahrens } 1513789Sahrens 1514789Sahrens void 1515789Sahrens zio_vdev_io_bypass(zio_t *zio) 1516789Sahrens { 1517789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1518789Sahrens ASSERT(zio->io_error == 0); 1519789Sahrens 1520789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1521789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1522789Sahrens } 1523789Sahrens 1524789Sahrens /* 1525789Sahrens * ========================================================================== 1526789Sahrens * Generate and verify checksums 1527789Sahrens * ========================================================================== 1528789Sahrens */ 1529789Sahrens static void 1530789Sahrens zio_checksum_generate(zio_t *zio) 1531789Sahrens { 1532789Sahrens int checksum = zio->io_checksum; 1533789Sahrens blkptr_t *bp = zio->io_bp; 1534789Sahrens 1535789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1536789Sahrens 1537789Sahrens BP_SET_CHECKSUM(bp, checksum); 1538789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1539789Sahrens 1540789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1541789Sahrens 1542789Sahrens zio_next_stage(zio); 1543789Sahrens } 1544789Sahrens 1545789Sahrens static void 1546789Sahrens zio_gang_checksum_generate(zio_t *zio) 1547789Sahrens { 1548789Sahrens zio_cksum_t zc; 1549789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1550789Sahrens 15511775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1552789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1553789Sahrens 1554789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1555789Sahrens 1556789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1557789Sahrens 1558789Sahrens zio_next_stage(zio); 1559789Sahrens } 1560789Sahrens 1561789Sahrens static void 1562789Sahrens zio_checksum_verify(zio_t *zio) 1563789Sahrens { 1564789Sahrens if (zio->io_bp != NULL) { 1565789Sahrens zio->io_error = zio_checksum_error(zio); 15661544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 15671544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 15681544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1569789Sahrens } 1570789Sahrens 1571789Sahrens zio_next_stage(zio); 1572789Sahrens } 1573789Sahrens 1574789Sahrens /* 1575789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1576789Sahrens */ 1577789Sahrens void 1578789Sahrens zio_checksum_verified(zio_t *zio) 1579789Sahrens { 1580789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1581789Sahrens } 1582789Sahrens 1583789Sahrens /* 1584789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1585789Sahrens */ 1586789Sahrens void 1587789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1588789Sahrens { 15891775Sbillm blkptr_t *bp = zio->io_bp; 15901775Sbillm 15911775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 15921775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 15931775Sbillm zcp->zc_word[2] = bp->blk_birth; 1594789Sahrens zcp->zc_word[3] = 0; 1595789Sahrens } 1596789Sahrens 1597789Sahrens /* 1598789Sahrens * ========================================================================== 1599789Sahrens * Define the pipeline 1600789Sahrens * ========================================================================== 1601789Sahrens */ 1602789Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1603789Sahrens 1604789Sahrens static void 1605789Sahrens zio_badop(zio_t *zio) 1606789Sahrens { 1607789Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1608789Sahrens } 1609789Sahrens 1610789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1611789Sahrens zio_badop, 1612789Sahrens zio_wait_children_ready, 1613789Sahrens zio_write_compress, 1614789Sahrens zio_checksum_generate, 1615789Sahrens zio_gang_pipeline, 1616789Sahrens zio_get_gang_header, 1617789Sahrens zio_rewrite_gang_members, 1618789Sahrens zio_free_gang_members, 1619789Sahrens zio_claim_gang_members, 1620789Sahrens zio_dva_allocate, 1621789Sahrens zio_dva_free, 1622789Sahrens zio_dva_claim, 1623789Sahrens zio_gang_checksum_generate, 1624789Sahrens zio_ready, 1625789Sahrens zio_vdev_io_start, 1626789Sahrens zio_vdev_io_done, 1627789Sahrens zio_vdev_io_assess, 1628789Sahrens zio_wait_children_done, 1629789Sahrens zio_checksum_verify, 1630789Sahrens zio_read_gang_members, 1631789Sahrens zio_read_decompress, 1632789Sahrens zio_done, 1633789Sahrens zio_badop 1634789Sahrens }; 1635789Sahrens 1636789Sahrens /* 1637789Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1638789Sahrens * There's no locking on io_stage because there's no legitimate way for 1639789Sahrens * multiple threads to be attempting to process the same I/O. 1640789Sahrens */ 1641789Sahrens void 1642789Sahrens zio_next_stage(zio_t *zio) 1643789Sahrens { 1644789Sahrens uint32_t pipeline = zio->io_pipeline; 1645789Sahrens 1646789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1647789Sahrens 1648789Sahrens if (zio->io_error) { 1649789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1650789Sahrens zio, vdev_description(zio->io_vd), 1651789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1652789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1653789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1654789Sahrens } 1655789Sahrens 1656789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1657789Sahrens continue; 1658789Sahrens 1659789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1660789Sahrens ASSERT(zio->io_stalled == 0); 1661789Sahrens 1662789Sahrens zio_pipeline[zio->io_stage](zio); 1663789Sahrens } 1664789Sahrens 1665789Sahrens void 1666789Sahrens zio_next_stage_async(zio_t *zio) 1667789Sahrens { 1668789Sahrens taskq_t *tq; 1669789Sahrens uint32_t pipeline = zio->io_pipeline; 1670789Sahrens 1671789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1672789Sahrens 1673789Sahrens if (zio->io_error) { 1674789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 1675789Sahrens zio, vdev_description(zio->io_vd), 1676789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 1677789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 1678789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1679789Sahrens } 1680789Sahrens 1681789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 1682789Sahrens continue; 1683789Sahrens 1684789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 1685789Sahrens ASSERT(zio->io_stalled == 0); 1686789Sahrens 1687789Sahrens /* 1688789Sahrens * For performance, we'll probably want two sets of task queues: 1689789Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 1690789Sahrens * part is for read performance: since we have to make a pass over 1691789Sahrens * the data to checksum it anyway, we want to do this on the same CPU 1692789Sahrens * that issued the read, because (assuming CPU scheduling affinity) 1693789Sahrens * that thread is probably still there. Getting this optimization 1694789Sahrens * right avoids performance-hostile cache-to-cache transfers. 1695789Sahrens * 1696789Sahrens * Note that having two sets of task queues is also necessary for 1697789Sahrens * correctness: if all of the issue threads get bogged down waiting 1698789Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 1699789Sahrens * there won't be any threads available to service I/O completion 1700789Sahrens * interrupts. 1701789Sahrens */ 1702789Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 1703789Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 1704789Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 1705789Sahrens else 1706789Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 1707789Sahrens (void) taskq_dispatch(tq, 1708789Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 1709789Sahrens } else { 1710789Sahrens zio_pipeline[zio->io_stage](zio); 1711789Sahrens } 1712789Sahrens } 1713789Sahrens 1714789Sahrens /* 1715789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 1716789Sahrens */ 1717789Sahrens int 17183063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 17193063Sperrin uint64_t txg) 1720789Sahrens { 1721789Sahrens int error; 1722789Sahrens 17231544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1724789Sahrens 17253063Sperrin /* 17263063Sperrin * We were passed the previous log blocks dva_t in bp->blk_dva[0]. 17273063Sperrin */ 17283063Sperrin error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE); 1729789Sahrens 1730789Sahrens if (error == 0) { 17313063Sperrin BP_SET_LSIZE(new_bp, size); 17323063Sperrin BP_SET_PSIZE(new_bp, size); 17333063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 17343063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 17353063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 17363063Sperrin BP_SET_LEVEL(new_bp, 0); 17373063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 17383063Sperrin new_bp->blk_birth = txg; 1739789Sahrens } 1740789Sahrens 17411544Seschrock spa_config_exit(spa, FTAG); 1742789Sahrens 1743789Sahrens return (error); 1744789Sahrens } 1745789Sahrens 1746789Sahrens /* 1747789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 1748789Sahrens * nothing to do except metaslab_free() it. 1749789Sahrens */ 1750789Sahrens void 1751789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 1752789Sahrens { 17531775Sbillm ASSERT(!BP_IS_GANG(bp)); 1754789Sahrens 17551544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1756789Sahrens 17571807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 1758789Sahrens 17591544Seschrock spa_config_exit(spa, FTAG); 1760789Sahrens } 1761