1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 223459Sek110237 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 64789Sahrens /* At or above this size, force gang blocking - for testing */ 65789Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; 66789Sahrens 673668Sgw25295 /* Force an allocation failure when non-zero */ 683668Sgw25295 uint16_t zio_zil_fail_shift = 0; 695329Sgw25295 uint16_t zio_io_fail_shift = 0; 705329Sgw25295 715329Sgw25295 /* Enable/disable the write-retry logic */ 725329Sgw25295 int zio_write_retry = 1; 735329Sgw25295 745329Sgw25295 /* Taskq to handle reissuing of I/Os */ 755329Sgw25295 taskq_t *zio_taskq; 765329Sgw25295 int zio_resume_threads = 4; 773668Sgw25295 78789Sahrens typedef struct zio_sync_pass { 79789Sahrens int zp_defer_free; /* defer frees after this pass */ 80789Sahrens int zp_dontcompress; /* don't compress after this pass */ 81789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 82789Sahrens } zio_sync_pass_t; 83789Sahrens 84789Sahrens zio_sync_pass_t zio_sync_pass = { 85789Sahrens 1, /* zp_defer_free */ 86789Sahrens 4, /* zp_dontcompress */ 87789Sahrens 1, /* zp_rewrite */ 88789Sahrens }; 89789Sahrens 905329Sgw25295 static boolean_t zio_io_should_fail(uint16_t); 915329Sgw25295 92789Sahrens /* 93789Sahrens * ========================================================================== 94789Sahrens * I/O kmem caches 95789Sahrens * ========================================================================== 96789Sahrens */ 974055Seschrock kmem_cache_t *zio_cache; 98789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 993290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 1003290Sjohansen 1013290Sjohansen #ifdef _KERNEL 1023290Sjohansen extern vmem_t *zio_alloc_arena; 1033290Sjohansen #endif 104789Sahrens 1055329Sgw25295 /* 1065329Sgw25295 * Determine if we are allowed to issue the IO based on the 1075329Sgw25295 * pool state. If we must wait then block until we are told 1085329Sgw25295 * that we may continue. 1095329Sgw25295 */ 1105329Sgw25295 #define ZIO_ENTER(spa) { \ 1115329Sgw25295 if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ 1125329Sgw25295 mutex_enter(&spa->spa_zio_lock); \ 1135329Sgw25295 while (spa->spa_state == POOL_STATE_IO_FAILURE) \ 1145329Sgw25295 cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ 1155329Sgw25295 mutex_exit(&spa->spa_zio_lock); \ 1165329Sgw25295 } \ 1175329Sgw25295 } 1185329Sgw25295 1195329Sgw25295 /* 1205329Sgw25295 * An allocation zio is one that either currently has the DVA allocate 1215329Sgw25295 * stage set or will have it later in it's lifetime. 1225329Sgw25295 */ 1235329Sgw25295 #define IO_IS_ALLOCATING(zio) \ 1245329Sgw25295 ((zio)->io_orig_pipeline == ZIO_WRITE_PIPELINE || \ 1255329Sgw25295 (zio)->io_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 1265329Sgw25295 127789Sahrens void 128789Sahrens zio_init(void) 129789Sahrens { 130789Sahrens size_t c; 1313290Sjohansen vmem_t *data_alloc_arena = NULL; 1323290Sjohansen 1333290Sjohansen #ifdef _KERNEL 1343290Sjohansen data_alloc_arena = zio_alloc_arena; 1353290Sjohansen #endif 136789Sahrens 1374055Seschrock zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 1384055Seschrock NULL, NULL, NULL, NULL, NULL, 0); 1394055Seschrock 140789Sahrens /* 141789Sahrens * For small buffers, we want a cache for each multiple of 142789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 143789Sahrens * for each quarter-power of 2. For large buffers, we want 144789Sahrens * a cache for each multiple of PAGESIZE. 145789Sahrens */ 146789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 147789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 148789Sahrens size_t p2 = size; 149789Sahrens size_t align = 0; 150789Sahrens 151789Sahrens while (p2 & (p2 - 1)) 152789Sahrens p2 &= p2 - 1; 153789Sahrens 154789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 155789Sahrens align = SPA_MINBLOCKSIZE; 156789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 157789Sahrens align = PAGESIZE; 158789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 159789Sahrens align = p2 >> 2; 160789Sahrens } 161789Sahrens 162789Sahrens if (align != 0) { 1633290Sjohansen char name[36]; 1642856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 165789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 166849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1673290Sjohansen 1683290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1693290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1703290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1713290Sjohansen KMC_NODEBUG); 1723290Sjohansen 173789Sahrens dprintf("creating cache for size %5lx align %5lx\n", 174789Sahrens size, align); 175789Sahrens } 176789Sahrens } 177789Sahrens 178789Sahrens while (--c != 0) { 179789Sahrens ASSERT(zio_buf_cache[c] != NULL); 180789Sahrens if (zio_buf_cache[c - 1] == NULL) 181789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1823290Sjohansen 1833290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1843290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1853290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 186789Sahrens } 1871544Seschrock 1885329Sgw25295 zio_taskq = taskq_create("zio_taskq", zio_resume_threads, 1895329Sgw25295 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 1905329Sgw25295 1911544Seschrock zio_inject_init(); 192789Sahrens } 193789Sahrens 194789Sahrens void 195789Sahrens zio_fini(void) 196789Sahrens { 197789Sahrens size_t c; 198789Sahrens kmem_cache_t *last_cache = NULL; 1993290Sjohansen kmem_cache_t *last_data_cache = NULL; 200789Sahrens 201789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 202789Sahrens if (zio_buf_cache[c] != last_cache) { 203789Sahrens last_cache = zio_buf_cache[c]; 204789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 205789Sahrens } 206789Sahrens zio_buf_cache[c] = NULL; 2073290Sjohansen 2083290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 2093290Sjohansen last_data_cache = zio_data_buf_cache[c]; 2103290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 2113290Sjohansen } 2123290Sjohansen zio_data_buf_cache[c] = NULL; 213789Sahrens } 2141544Seschrock 2155329Sgw25295 taskq_destroy(zio_taskq); 2165329Sgw25295 2174055Seschrock kmem_cache_destroy(zio_cache); 2184055Seschrock 2191544Seschrock zio_inject_fini(); 220789Sahrens } 221789Sahrens 222789Sahrens /* 223789Sahrens * ========================================================================== 224789Sahrens * Allocate and free I/O buffers 225789Sahrens * ========================================================================== 226789Sahrens */ 2273290Sjohansen 2283290Sjohansen /* 2293290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 2303290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 2313290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 2323290Sjohansen * excess / transient data in-core during a crashdump. 2333290Sjohansen */ 234789Sahrens void * 235789Sahrens zio_buf_alloc(size_t size) 236789Sahrens { 237789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 238789Sahrens 239789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 240789Sahrens 241789Sahrens return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP)); 242789Sahrens } 243789Sahrens 2443290Sjohansen /* 2453290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2463290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2473290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2483290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2493290Sjohansen */ 2503290Sjohansen void * 2513290Sjohansen zio_data_buf_alloc(size_t size) 2523290Sjohansen { 2533290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2543290Sjohansen 2553290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2563290Sjohansen 2573290Sjohansen return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP)); 2583290Sjohansen } 2593290Sjohansen 260789Sahrens void 261789Sahrens zio_buf_free(void *buf, size_t size) 262789Sahrens { 263789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 264789Sahrens 265789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 266789Sahrens 267789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 268789Sahrens } 269789Sahrens 2703290Sjohansen void 2713290Sjohansen zio_data_buf_free(void *buf, size_t size) 2723290Sjohansen { 2733290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2743290Sjohansen 2753290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2763290Sjohansen 2773290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2783290Sjohansen } 2793463Sahrens 280789Sahrens /* 281789Sahrens * ========================================================================== 282789Sahrens * Push and pop I/O transform buffers 283789Sahrens * ========================================================================== 284789Sahrens */ 285789Sahrens static void 286789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 287789Sahrens { 288789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 289789Sahrens 290789Sahrens zt->zt_data = data; 291789Sahrens zt->zt_size = size; 292789Sahrens zt->zt_bufsize = bufsize; 293789Sahrens 294789Sahrens zt->zt_next = zio->io_transform_stack; 295789Sahrens zio->io_transform_stack = zt; 296789Sahrens 297789Sahrens zio->io_data = data; 298789Sahrens zio->io_size = size; 299789Sahrens } 300789Sahrens 301789Sahrens static void 302789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 303789Sahrens { 304789Sahrens zio_transform_t *zt = zio->io_transform_stack; 305789Sahrens 306789Sahrens *data = zt->zt_data; 307789Sahrens *size = zt->zt_size; 308789Sahrens *bufsize = zt->zt_bufsize; 309789Sahrens 310789Sahrens zio->io_transform_stack = zt->zt_next; 311789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 312789Sahrens 313789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 314789Sahrens zio->io_data = zt->zt_data; 315789Sahrens zio->io_size = zt->zt_size; 316789Sahrens } 317789Sahrens } 318789Sahrens 319789Sahrens static void 320789Sahrens zio_clear_transform_stack(zio_t *zio) 321789Sahrens { 322789Sahrens void *data; 323789Sahrens uint64_t size, bufsize; 324789Sahrens 325789Sahrens ASSERT(zio->io_transform_stack != NULL); 326789Sahrens 327789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 328789Sahrens while (zio->io_transform_stack != NULL) { 329789Sahrens zio_buf_free(data, bufsize); 330789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 331789Sahrens } 332789Sahrens } 333789Sahrens 334789Sahrens /* 335789Sahrens * ========================================================================== 336789Sahrens * Create the various types of I/O (read, write, free) 337789Sahrens * ========================================================================== 338789Sahrens */ 339789Sahrens static zio_t * 340789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 341789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 342789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 343789Sahrens { 344789Sahrens zio_t *zio; 345789Sahrens 346789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 347789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 348789Sahrens 3494055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 3504055Seschrock bzero(zio, sizeof (zio_t)); 351789Sahrens zio->io_parent = pio; 352789Sahrens zio->io_spa = spa; 353789Sahrens zio->io_txg = txg; 3544634Sek110237 zio->io_flags = flags; 355789Sahrens if (bp != NULL) { 356789Sahrens zio->io_bp = bp; 357789Sahrens zio->io_bp_copy = *bp; 358789Sahrens zio->io_bp_orig = *bp; 3594634Sek110237 if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata || 3604634Sek110237 BP_GET_LEVEL(bp) != 0) 3614634Sek110237 zio->io_flags |= ZIO_FLAG_METADATA; 362789Sahrens } 363789Sahrens zio->io_done = done; 364789Sahrens zio->io_private = private; 365789Sahrens zio->io_type = type; 366789Sahrens zio->io_priority = priority; 367789Sahrens zio->io_stage = stage; 368789Sahrens zio->io_pipeline = pipeline; 369789Sahrens zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; 370789Sahrens zio->io_timestamp = lbolt64; 3714634Sek110237 if (pio != NULL) 3724634Sek110237 zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA); 3732856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 3744831Sgw25295 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 375789Sahrens zio_push_transform(zio, data, size, size); 376789Sahrens 3773463Sahrens /* 3783463Sahrens * Note on config lock: 3793463Sahrens * 3803463Sahrens * If CONFIG_HELD is set, then the caller already has the config 3813463Sahrens * lock, so we don't need it for this io. 3823463Sahrens * 3833463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 3843463Sahrens * config lock on behalf of this io, so it should be released 3853463Sahrens * in zio_done. 3863463Sahrens * 3873463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 3883463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 3893463Sahrens * The NULL top-level ios rarely have any children, so we delay 3903463Sahrens * grabbing the lock until the first child is added (but it is 3913463Sahrens * still grabbed on behalf of the top-level i/o, so additional 3923463Sahrens * children don't need to also grab it). This greatly reduces 3933463Sahrens * contention on the config lock. 3943463Sahrens */ 395789Sahrens if (pio == NULL) { 3963463Sahrens if (type != ZIO_TYPE_NULL && 3973463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3981544Seschrock spa_config_enter(zio->io_spa, RW_READER, zio); 3993463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 4003463Sahrens } 401789Sahrens zio->io_root = zio; 402789Sahrens } else { 403789Sahrens zio->io_root = pio->io_root; 4041544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 4051544Seschrock zio->io_logical = pio->io_logical; 406789Sahrens mutex_enter(&pio->io_lock); 4073463Sahrens if (pio->io_parent == NULL && 4083463Sahrens pio->io_type == ZIO_TYPE_NULL && 4093463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 4103463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 4113463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 4123463Sahrens spa_config_enter(zio->io_spa, RW_READER, pio); 4133463Sahrens } 414789Sahrens if (stage < ZIO_STAGE_READY) 415789Sahrens pio->io_children_notready++; 416789Sahrens pio->io_children_notdone++; 417789Sahrens zio->io_sibling_next = pio->io_child; 418789Sahrens zio->io_sibling_prev = NULL; 419789Sahrens if (pio->io_child != NULL) 420789Sahrens pio->io_child->io_sibling_prev = zio; 421789Sahrens pio->io_child = zio; 4221775Sbillm zio->io_ndvas = pio->io_ndvas; 423789Sahrens mutex_exit(&pio->io_lock); 424789Sahrens } 425789Sahrens 4265329Sgw25295 /* 4275329Sgw25295 * Save off the original state incase we need to retry later. 4285329Sgw25295 */ 4295329Sgw25295 zio->io_orig_stage = zio->io_stage; 4305329Sgw25295 zio->io_orig_pipeline = zio->io_pipeline; 4315329Sgw25295 zio->io_orig_flags = zio->io_flags; 4325329Sgw25295 433789Sahrens return (zio); 434789Sahrens } 435789Sahrens 4365329Sgw25295 static void 4375329Sgw25295 zio_reset(zio_t *zio) 4385329Sgw25295 { 4395329Sgw25295 zio_clear_transform_stack(zio); 4405329Sgw25295 4415329Sgw25295 zio->io_flags = zio->io_orig_flags; 4425329Sgw25295 zio->io_stage = zio->io_orig_stage; 4435329Sgw25295 zio->io_pipeline = zio->io_orig_pipeline; 4445329Sgw25295 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); 4455329Sgw25295 } 4465329Sgw25295 447789Sahrens zio_t * 448789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 449789Sahrens int flags) 450789Sahrens { 451789Sahrens zio_t *zio; 452789Sahrens 453789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 454789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 455789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 456789Sahrens 457789Sahrens return (zio); 458789Sahrens } 459789Sahrens 460789Sahrens zio_t * 461789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 462789Sahrens { 463789Sahrens return (zio_null(NULL, spa, done, private, flags)); 464789Sahrens } 465789Sahrens 466789Sahrens zio_t * 467789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 468789Sahrens uint64_t size, zio_done_func_t *done, void *private, 4691544Seschrock int priority, int flags, zbookmark_t *zb) 470789Sahrens { 471789Sahrens zio_t *zio; 472789Sahrens 473789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 474789Sahrens 4755329Sgw25295 /* 4765329Sgw25295 * If the user has specified that we allow I/Os to continue 4775329Sgw25295 * then attempt to satisfy the read. 4785329Sgw25295 */ 4795329Sgw25295 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 4805329Sgw25295 ZIO_ENTER(spa); 4815329Sgw25295 482789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 4832981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4842981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4851544Seschrock zio->io_bookmark = *zb; 4861544Seschrock 4871544Seschrock zio->io_logical = zio; 488789Sahrens 489789Sahrens /* 490789Sahrens * Work off our copy of the bp so the caller can free it. 491789Sahrens */ 492789Sahrens zio->io_bp = &zio->io_bp_copy; 493789Sahrens 494789Sahrens return (zio); 495789Sahrens } 496789Sahrens 497789Sahrens zio_t * 4981775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 499789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 5003547Smaybee zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 5013547Smaybee int flags, zbookmark_t *zb) 502789Sahrens { 503789Sahrens zio_t *zio; 504789Sahrens 505789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 506789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 507789Sahrens 508789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 509789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 510789Sahrens 5115329Sgw25295 ZIO_ENTER(spa); 5125329Sgw25295 513789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 5142981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 515789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 516789Sahrens 5173547Smaybee zio->io_ready = ready; 5183547Smaybee 5191544Seschrock zio->io_bookmark = *zb; 5201544Seschrock 5211544Seschrock zio->io_logical = zio; 5221544Seschrock 523789Sahrens zio->io_checksum = checksum; 524789Sahrens zio->io_compress = compress; 5251775Sbillm zio->io_ndvas = ncopies; 526789Sahrens 527789Sahrens if (compress != ZIO_COMPRESS_OFF) 528789Sahrens zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; 529789Sahrens 530789Sahrens if (bp->blk_birth != txg) { 531789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 532789Sahrens BP_ZERO(bp); 533789Sahrens BP_SET_LSIZE(bp, size); 534789Sahrens BP_SET_PSIZE(bp, size); 5351775Sbillm } else { 5361775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 5371775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 5381775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 539789Sahrens } 540789Sahrens 541789Sahrens return (zio); 542789Sahrens } 543789Sahrens 544789Sahrens zio_t * 545789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 546789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 5471544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 5481544Seschrock zbookmark_t *zb) 549789Sahrens { 550789Sahrens zio_t *zio; 551789Sahrens 552789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 5532981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 554789Sahrens ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 555789Sahrens 5561544Seschrock zio->io_bookmark = *zb; 557789Sahrens zio->io_checksum = checksum; 558789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 559789Sahrens 5601775Sbillm if (pio != NULL) 5611775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 5621775Sbillm 563789Sahrens return (zio); 564789Sahrens } 565789Sahrens 5665329Sgw25295 static void 5675329Sgw25295 zio_write_allocate_ready(zio_t *zio) 5685329Sgw25295 { 5695329Sgw25295 /* Free up the previous block */ 5705329Sgw25295 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 5715329Sgw25295 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 5725329Sgw25295 &zio->io_bp_orig, NULL, NULL)); 5735329Sgw25295 } 5745329Sgw25295 } 5755329Sgw25295 576789Sahrens static zio_t * 577789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 578789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 579789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 580789Sahrens { 581789Sahrens zio_t *zio; 582789Sahrens 583789Sahrens BP_ZERO(bp); 584789Sahrens BP_SET_LSIZE(bp, size); 585789Sahrens BP_SET_PSIZE(bp, size); 586789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 587789Sahrens 588789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 589789Sahrens ZIO_TYPE_WRITE, priority, flags, 590789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 591789Sahrens 592789Sahrens zio->io_checksum = checksum; 593789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 5945329Sgw25295 zio->io_ready = zio_write_allocate_ready; 595789Sahrens 596789Sahrens return (zio); 597789Sahrens } 598789Sahrens 599789Sahrens zio_t * 600789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 601789Sahrens zio_done_func_t *done, void *private) 602789Sahrens { 603789Sahrens zio_t *zio; 604789Sahrens 605789Sahrens ASSERT(!BP_IS_HOLE(bp)); 606789Sahrens 607789Sahrens if (txg == spa->spa_syncing_txg && 608789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 609789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 610789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 611789Sahrens } 612789Sahrens 613789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 6142981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 615789Sahrens ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 616789Sahrens 617789Sahrens zio->io_bp = &zio->io_bp_copy; 618789Sahrens 619789Sahrens return (zio); 620789Sahrens } 621789Sahrens 622789Sahrens zio_t * 623789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 624789Sahrens zio_done_func_t *done, void *private) 625789Sahrens { 626789Sahrens zio_t *zio; 627789Sahrens 628789Sahrens /* 629789Sahrens * A claim is an allocation of a specific block. Claims are needed 630789Sahrens * to support immediate writes in the intent log. The issue is that 631789Sahrens * immediate writes contain committed data, but in a txg that was 632789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 633789Sahrens * the intent log claims all blocks that contain immediate write data 634789Sahrens * so that the SPA knows they're in use. 635789Sahrens * 636789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 637789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 638789Sahrens */ 639789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 640789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 641789Sahrens 642789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 643789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 644789Sahrens ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 645789Sahrens 646789Sahrens zio->io_bp = &zio->io_bp_copy; 647789Sahrens 648789Sahrens return (zio); 649789Sahrens } 650789Sahrens 651789Sahrens zio_t * 652789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 653789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 654789Sahrens { 655789Sahrens zio_t *zio; 656789Sahrens int c; 657789Sahrens 658789Sahrens if (vd->vdev_children == 0) { 659789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 660789Sahrens ZIO_TYPE_IOCTL, priority, flags, 661789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 662789Sahrens 663789Sahrens zio->io_vd = vd; 664789Sahrens zio->io_cmd = cmd; 665789Sahrens } else { 666789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 667789Sahrens 668789Sahrens for (c = 0; c < vd->vdev_children; c++) 669789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 670789Sahrens done, private, priority, flags)); 671789Sahrens } 672789Sahrens 673789Sahrens return (zio); 674789Sahrens } 675789Sahrens 676789Sahrens static void 677789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 678*5450Sbrendan int checksum, boolean_t labels) 679789Sahrens { 680789Sahrens ASSERT(vd->vdev_children == 0); 681789Sahrens 682789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 683789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 684789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 685789Sahrens 686*5450Sbrendan #ifdef ZFS_DEBUG 687*5450Sbrendan if (labels) { 688*5450Sbrendan ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 689*5450Sbrendan offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 690*5450Sbrendan } 691*5450Sbrendan #endif 692789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 693789Sahrens 694789Sahrens BP_ZERO(bp); 695789Sahrens 696789Sahrens BP_SET_LSIZE(bp, size); 697789Sahrens BP_SET_PSIZE(bp, size); 698789Sahrens 699789Sahrens BP_SET_CHECKSUM(bp, checksum); 700789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 701789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 702789Sahrens 703789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 704789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 705789Sahrens } 706789Sahrens 707789Sahrens zio_t * 708789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 709789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 710*5450Sbrendan int priority, int flags, boolean_t labels) 711789Sahrens { 712789Sahrens zio_t *zio; 713789Sahrens blkptr_t blk; 714789Sahrens 7155329Sgw25295 ZIO_ENTER(vd->vdev_spa); 7165329Sgw25295 717*5450Sbrendan zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 718789Sahrens 719789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 720789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 721789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 722789Sahrens 723789Sahrens zio->io_vd = vd; 724789Sahrens zio->io_offset = offset; 725789Sahrens 726789Sahrens /* 727789Sahrens * Work off our copy of the bp so the caller can free it. 728789Sahrens */ 729789Sahrens zio->io_bp = &zio->io_bp_copy; 730789Sahrens 731789Sahrens return (zio); 732789Sahrens } 733789Sahrens 734789Sahrens zio_t * 735789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 736789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 737*5450Sbrendan int priority, int flags, boolean_t labels) 738789Sahrens { 739789Sahrens zio_block_tail_t *zbt; 740789Sahrens void *wbuf; 741789Sahrens zio_t *zio; 742789Sahrens blkptr_t blk; 743789Sahrens 7445329Sgw25295 ZIO_ENTER(vd->vdev_spa); 7455329Sgw25295 746*5450Sbrendan zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 747789Sahrens 748789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 749789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 750789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 751789Sahrens 752789Sahrens zio->io_vd = vd; 753789Sahrens zio->io_offset = offset; 754789Sahrens 755789Sahrens zio->io_bp = &zio->io_bp_copy; 756789Sahrens zio->io_checksum = checksum; 757789Sahrens 758789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 759789Sahrens /* 760789Sahrens * zbt checksums are necessarily destructive -- they modify 761789Sahrens * one word of the write buffer to hold the verifier/checksum. 762789Sahrens * Therefore, we must make a local copy in case the data is 763789Sahrens * being written to multiple places. 764789Sahrens */ 765789Sahrens wbuf = zio_buf_alloc(size); 766789Sahrens bcopy(data, wbuf, size); 767789Sahrens zio_push_transform(zio, wbuf, size, size); 768789Sahrens 769789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 770789Sahrens zbt->zbt_cksum = blk.blk_cksum; 771789Sahrens } 772789Sahrens 773789Sahrens return (zio); 774789Sahrens } 775789Sahrens 776789Sahrens /* 777789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 778789Sahrens */ 779789Sahrens zio_t * 780789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 781789Sahrens void *data, uint64_t size, int type, int priority, int flags, 782789Sahrens zio_done_func_t *done, void *private) 783789Sahrens { 784789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 785789Sahrens zio_t *cio; 786789Sahrens 787789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 788789Sahrens /* 789789Sahrens * If we have the bp, then the child should perform the 790789Sahrens * checksum and the parent need not. This pushes error 791789Sahrens * detection as close to the leaves as possible and 792789Sahrens * eliminates redundant checksums in the interior nodes. 793789Sahrens */ 794789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 795789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 796789Sahrens } 797789Sahrens 798789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 799789Sahrens done, private, type, priority, 800789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 8011775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 802789Sahrens 803789Sahrens cio->io_vd = vd; 804789Sahrens cio->io_offset = offset; 805789Sahrens 806789Sahrens return (cio); 807789Sahrens } 808789Sahrens 809789Sahrens /* 810789Sahrens * ========================================================================== 811789Sahrens * Initiate I/O, either sync or async 812789Sahrens * ========================================================================== 813789Sahrens */ 814789Sahrens int 815789Sahrens zio_wait(zio_t *zio) 816789Sahrens { 817789Sahrens int error; 818789Sahrens 819789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 820789Sahrens 821789Sahrens zio->io_waiter = curthread; 822789Sahrens 823789Sahrens zio_next_stage_async(zio); 824789Sahrens 825789Sahrens mutex_enter(&zio->io_lock); 826789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 827789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 828789Sahrens mutex_exit(&zio->io_lock); 829789Sahrens 830789Sahrens error = zio->io_error; 8312856Snd150628 mutex_destroy(&zio->io_lock); 8324831Sgw25295 cv_destroy(&zio->io_cv); 8334055Seschrock kmem_cache_free(zio_cache, zio); 834789Sahrens 835789Sahrens return (error); 836789Sahrens } 837789Sahrens 838789Sahrens void 839789Sahrens zio_nowait(zio_t *zio) 840789Sahrens { 841789Sahrens zio_next_stage_async(zio); 842789Sahrens } 843789Sahrens 844789Sahrens /* 845789Sahrens * ========================================================================== 846789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 847789Sahrens * ========================================================================== 848789Sahrens */ 849789Sahrens static void 850789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 851789Sahrens { 852789Sahrens mutex_enter(&zio->io_lock); 853789Sahrens if (*countp == 0) { 854789Sahrens ASSERT(zio->io_stalled == 0); 855789Sahrens mutex_exit(&zio->io_lock); 856789Sahrens zio_next_stage(zio); 857789Sahrens } else { 858789Sahrens zio->io_stalled = stage; 859789Sahrens mutex_exit(&zio->io_lock); 860789Sahrens } 861789Sahrens } 862789Sahrens 863789Sahrens static void 864789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 865789Sahrens { 866789Sahrens zio_t *pio = zio->io_parent; 867789Sahrens 868789Sahrens mutex_enter(&pio->io_lock); 869789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 870789Sahrens pio->io_error = zio->io_error; 8715329Sgw25295 ASSERT3U(*countp, >, 0); 872789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 873789Sahrens pio->io_stalled = 0; 874789Sahrens mutex_exit(&pio->io_lock); 875789Sahrens zio_next_stage_async(pio); 876789Sahrens } else { 877789Sahrens mutex_exit(&pio->io_lock); 878789Sahrens } 879789Sahrens } 880789Sahrens 881789Sahrens static void 882789Sahrens zio_wait_children_ready(zio_t *zio) 883789Sahrens { 884789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 885789Sahrens &zio->io_children_notready); 886789Sahrens } 887789Sahrens 888789Sahrens void 889789Sahrens zio_wait_children_done(zio_t *zio) 890789Sahrens { 891789Sahrens zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 892789Sahrens &zio->io_children_notdone); 893789Sahrens } 894789Sahrens 895789Sahrens static void 8965329Sgw25295 zio_read_init(zio_t *zio) 8975329Sgw25295 { 8985329Sgw25295 if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) { 8995329Sgw25295 uint64_t csize = BP_GET_PSIZE(zio->io_bp); 9005329Sgw25295 void *cbuf = zio_buf_alloc(csize); 9015329Sgw25295 9025329Sgw25295 zio_push_transform(zio, cbuf, csize, csize); 9035329Sgw25295 zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 9045329Sgw25295 } 9055329Sgw25295 9065329Sgw25295 if (BP_IS_GANG(zio->io_bp)) { 9075329Sgw25295 uint64_t gsize = SPA_GANGBLOCKSIZE; 9085329Sgw25295 void *gbuf = zio_buf_alloc(gsize); 9095329Sgw25295 9105329Sgw25295 zio_push_transform(zio, gbuf, gsize, gsize); 9115329Sgw25295 zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 9125329Sgw25295 } 9135329Sgw25295 zio_next_stage(zio); 9145329Sgw25295 } 9155329Sgw25295 9165329Sgw25295 static void 917789Sahrens zio_ready(zio_t *zio) 918789Sahrens { 919789Sahrens zio_t *pio = zio->io_parent; 920789Sahrens 9213547Smaybee if (zio->io_ready) 9223547Smaybee zio->io_ready(zio); 9233547Smaybee 924789Sahrens if (pio != NULL) 925789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, 926789Sahrens &pio->io_children_notready); 927789Sahrens 928789Sahrens if (zio->io_bp) 929789Sahrens zio->io_bp_copy = *zio->io_bp; 930789Sahrens 931789Sahrens zio_next_stage(zio); 932789Sahrens } 933789Sahrens 934789Sahrens static void 9355329Sgw25295 zio_vdev_retry_io(zio_t *zio) 936789Sahrens { 937789Sahrens zio_t *pio = zio->io_parent; 9385329Sgw25295 9395329Sgw25295 /* 9405329Sgw25295 * Preserve the failed bp so that the io_ready() callback can 9415329Sgw25295 * update the accounting accordingly. The callback will also be 9425329Sgw25295 * responsible for freeing the previously allocated block, if one 9435329Sgw25295 * exists. 9445329Sgw25295 */ 9455329Sgw25295 zio->io_bp_orig = *zio->io_bp; 9465329Sgw25295 9475329Sgw25295 /* 9485329Sgw25295 * We must zero out the old DVA and blk_birth before reallocating 9495403Sgw25295 * the bp. 9505329Sgw25295 */ 9515403Sgw25295 BP_ZERO_DVAS(zio->io_bp); 9525329Sgw25295 zio_reset(zio); 9535329Sgw25295 9545329Sgw25295 if (pio) { 9555329Sgw25295 /* 9565329Sgw25295 * Let the parent know that we will 9575329Sgw25295 * re-alloc the write (=> new bp info). 9585329Sgw25295 */ 9595329Sgw25295 mutex_enter(&pio->io_lock); 9605329Sgw25295 pio->io_children_notready++; 9615329Sgw25295 9625329Sgw25295 /* 9635329Sgw25295 * If the parent I/O is still in the open stage, then 9645329Sgw25295 * don't bother telling it to retry since it hasn't 9655329Sgw25295 * progressed far enough for it to care. 9665329Sgw25295 */ 9675329Sgw25295 if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) 9685329Sgw25295 pio->io_flags |= ZIO_FLAG_WRITE_RETRY; 9695329Sgw25295 9705329Sgw25295 ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE); 9715329Sgw25295 mutex_exit(&pio->io_lock); 9725329Sgw25295 } 9735329Sgw25295 9745329Sgw25295 /* 9755329Sgw25295 * We are getting ready to process the retry request so clear 9765329Sgw25295 * the flag and the zio's current error status. 9775329Sgw25295 */ 9785329Sgw25295 zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; 9795329Sgw25295 zio->io_error = 0; 9805329Sgw25295 zio_next_stage_async(zio); 9815329Sgw25295 } 9825329Sgw25295 9835329Sgw25295 int 9845329Sgw25295 zio_vdev_resume_io(spa_t *spa) 9855329Sgw25295 { 9865329Sgw25295 zio_t *zio; 9875329Sgw25295 9885329Sgw25295 mutex_enter(&spa->spa_zio_lock); 9895329Sgw25295 9905329Sgw25295 /* 9915329Sgw25295 * Probe all of vdevs that have experienced an I/O error. 9925329Sgw25295 * If we are still unable to verify the integrity of the vdev 9935329Sgw25295 * then we prevent the resume from proceeeding. 9945329Sgw25295 */ 9955329Sgw25295 for (zio = list_head(&spa->spa_zio_list); zio != NULL; 9965329Sgw25295 zio = list_next(&spa->spa_zio_list, zio)) { 9975329Sgw25295 int error = 0; 9985329Sgw25295 9995329Sgw25295 /* We only care about I/Os that must succeed */ 10005329Sgw25295 if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL) 10015329Sgw25295 continue; 10025329Sgw25295 error = vdev_probe(zio->io_vd); 10035329Sgw25295 if (error) { 10045329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10055329Sgw25295 return (error); 10065329Sgw25295 } 10075329Sgw25295 } 10085329Sgw25295 10095329Sgw25295 /* 10105329Sgw25295 * Clear the vdev stats so that I/O can flow. 10115329Sgw25295 */ 10125329Sgw25295 vdev_clear(spa, NULL, B_FALSE); 10135329Sgw25295 10145329Sgw25295 spa->spa_state = POOL_STATE_ACTIVE; 10155329Sgw25295 while ((zio = list_head(&spa->spa_zio_list)) != NULL) { 10165329Sgw25295 list_remove(&spa->spa_zio_list, zio); 10175329Sgw25295 zio->io_error = 0; 10185329Sgw25295 10195329Sgw25295 /* 10205329Sgw25295 * If we are resuming an allocating I/O then we force it 10215329Sgw25295 * to retry and let it resume operation where it left off. 10225329Sgw25295 * Otherwise, go back to the ready stage and pick up from 10235329Sgw25295 * there. 10245329Sgw25295 */ 10255329Sgw25295 if (zio_write_retry && IO_IS_ALLOCATING(zio)) { 10265329Sgw25295 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 10275329Sgw25295 zio->io_stage--; 10285329Sgw25295 } else { 10295329Sgw25295 zio->io_stage = ZIO_STAGE_READY; 10305329Sgw25295 } 10315329Sgw25295 10325329Sgw25295 (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async, 10335329Sgw25295 zio, TQ_SLEEP); 10345329Sgw25295 } 10355329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10365329Sgw25295 10375329Sgw25295 /* 10385329Sgw25295 * Wait for the taskqs to finish and recheck the pool state since 10395329Sgw25295 * it's possible that a resumed I/O has failed again. 10405329Sgw25295 */ 10415329Sgw25295 taskq_wait(zio_taskq); 10425329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 10435329Sgw25295 return (EIO); 10445329Sgw25295 10455329Sgw25295 mutex_enter(&spa->spa_zio_lock); 10465329Sgw25295 cv_broadcast(&spa->spa_zio_cv); 10475329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10485329Sgw25295 10495329Sgw25295 return (0); 10505329Sgw25295 } 10515329Sgw25295 10525329Sgw25295 static void 10535329Sgw25295 zio_vdev_suspend_io(zio_t *zio) 10545329Sgw25295 { 10555329Sgw25295 spa_t *spa = zio->io_spa; 10565329Sgw25295 10575329Sgw25295 /* 10585329Sgw25295 * We've experienced an unrecoverable failure so 10595329Sgw25295 * set the pool state accordingly and queue all 10605329Sgw25295 * failed IOs. 10615329Sgw25295 */ 10625329Sgw25295 spa->spa_state = POOL_STATE_IO_FAILURE; 10635329Sgw25295 10645329Sgw25295 mutex_enter(&spa->spa_zio_lock); 10655329Sgw25295 list_insert_tail(&spa->spa_zio_list, zio); 10665329Sgw25295 10675329Sgw25295 #ifndef _KERNEL 10685329Sgw25295 /* Used to notify ztest that the pool has suspended */ 10695329Sgw25295 cv_broadcast(&spa->spa_zio_cv); 10705329Sgw25295 #endif 10715329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10725329Sgw25295 } 10735329Sgw25295 10745329Sgw25295 static void 10755329Sgw25295 zio_assess(zio_t *zio) 10765329Sgw25295 { 1077789Sahrens spa_t *spa = zio->io_spa; 1078789Sahrens blkptr_t *bp = zio->io_bp; 1079789Sahrens vdev_t *vd = zio->io_vd; 1080789Sahrens 1081789Sahrens ASSERT(zio->io_children_notready == 0); 1082789Sahrens ASSERT(zio->io_children_notdone == 0); 1083789Sahrens 1084789Sahrens if (bp != NULL) { 1085789Sahrens ASSERT(bp->blk_pad[0] == 0); 1086789Sahrens ASSERT(bp->blk_pad[1] == 0); 1087789Sahrens ASSERT(bp->blk_pad[2] == 0); 1088789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 1089789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 10901775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 1091789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 10921775Sbillm if (zio->io_ndvas != 0) 10931775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 10941775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 10951775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 10961775Sbillm } 1097789Sahrens } 1098789Sahrens 10995329Sgw25295 /* 11005329Sgw25295 * Some child I/O has indicated that a retry is necessary, so 11015329Sgw25295 * we set an error on the I/O and let the logic below do the 11025329Sgw25295 * rest. 11035329Sgw25295 */ 11045329Sgw25295 if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) 11055329Sgw25295 zio->io_error = ERESTART; 11065329Sgw25295 1107789Sahrens if (vd != NULL) 1108789Sahrens vdev_stat_update(zio); 1109789Sahrens 1110789Sahrens if (zio->io_error) { 11111544Seschrock /* 11121544Seschrock * If this I/O is attached to a particular vdev, 11131544Seschrock * generate an error message describing the I/O failure 11141544Seschrock * at the block level. We ignore these errors if the 11151544Seschrock * device is currently unavailable. 11161544Seschrock */ 11171732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 11185329Sgw25295 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 1119789Sahrens 11201544Seschrock if ((zio->io_error == EIO || 11211544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 11221544Seschrock zio->io_logical == zio) { 11231544Seschrock /* 11241544Seschrock * For root I/O requests, tell the SPA to log the error 11251544Seschrock * appropriately. Also, generate a logical data 11261544Seschrock * ereport. 11271544Seschrock */ 11285329Sgw25295 spa_log_error(spa, zio); 11291544Seschrock 11305329Sgw25295 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 11315329Sgw25295 0, 0); 11321544Seschrock } 1133789Sahrens 11341544Seschrock /* 11355403Sgw25295 * If we are an allocating I/O then we attempt to reissue 11365403Sgw25295 * the I/O on another vdev unless the pool is out of space. 11375403Sgw25295 * We handle this condition based on the spa's failmode 11385403Sgw25295 * property. 11395329Sgw25295 */ 11405329Sgw25295 if (zio_write_retry && zio->io_error != ENOSPC && 11415403Sgw25295 IO_IS_ALLOCATING(zio)) { 11425329Sgw25295 zio_vdev_retry_io(zio); 11435329Sgw25295 return; 11445329Sgw25295 } 11455329Sgw25295 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 11465329Sgw25295 11475329Sgw25295 /* 11485329Sgw25295 * For I/O requests that cannot fail, we carry out 11495329Sgw25295 * the requested behavior based on the failmode pool 11505329Sgw25295 * property. 11515329Sgw25295 * 11525329Sgw25295 * XXX - Need to differentiate between an ENOSPC as 11535329Sgw25295 * a result of vdev failures vs. a full pool. 11541544Seschrock */ 11551544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 11563459Sek110237 char *blkbuf; 11573459Sek110237 11585329Sgw25295 #ifdef ZFS_DEBUG 11593459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 11603459Sek110237 if (blkbuf) { 11613459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 11623459Sek110237 bp ? bp : &zio->io_bp_copy); 11633459Sek110237 } 11645329Sgw25295 cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p " 11655329Sgw25295 "%s): error %d", zio->io_error == ECKSUM ? 11661544Seschrock "bad checksum" : "I/O failure", 11671544Seschrock zio_type_name[zio->io_type], 11681544Seschrock vdev_description(vd), 11691544Seschrock (u_longlong_t)zio->io_offset, 11705329Sgw25295 (void *)zio, blkbuf ? blkbuf : "", zio->io_error); 11715329Sgw25295 #endif 11725329Sgw25295 11735329Sgw25295 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { 11745329Sgw25295 fm_panic("Pool '%s' has encountered an " 11755329Sgw25295 "uncorrectable I/O failure and the " 11765329Sgw25295 "failure mode property for this pool " 11775329Sgw25295 "is set to panic.", spa_name(spa)); 11785329Sgw25295 } else { 11795329Sgw25295 cmn_err(CE_WARN, "Pool '%s' has encountered " 11805329Sgw25295 "an uncorrectable I/O error. Manual " 11815329Sgw25295 "intervention is required.", 11825329Sgw25295 spa_name(spa)); 11835329Sgw25295 zio_vdev_suspend_io(zio); 11845329Sgw25295 } 11855329Sgw25295 return; 11861544Seschrock } 1187789Sahrens } 11885329Sgw25295 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 11895329Sgw25295 ASSERT(zio->io_children_notready == 0); 11905329Sgw25295 zio_next_stage(zio); 11915329Sgw25295 } 11925329Sgw25295 11935329Sgw25295 static void 11945329Sgw25295 zio_done(zio_t *zio) 11955329Sgw25295 { 11965329Sgw25295 zio_t *pio = zio->io_parent; 11975329Sgw25295 spa_t *spa = zio->io_spa; 11985329Sgw25295 11995329Sgw25295 ASSERT(zio->io_children_notready == 0); 12005329Sgw25295 ASSERT(zio->io_children_notdone == 0); 12015329Sgw25295 1202789Sahrens zio_clear_transform_stack(zio); 1203789Sahrens 1204789Sahrens if (zio->io_done) 1205789Sahrens zio->io_done(zio); 1206789Sahrens 1207789Sahrens ASSERT(zio->io_delegate_list == NULL); 1208789Sahrens ASSERT(zio->io_delegate_next == NULL); 1209789Sahrens 1210789Sahrens if (pio != NULL) { 1211789Sahrens zio_t *next, *prev; 1212789Sahrens 1213789Sahrens mutex_enter(&pio->io_lock); 1214789Sahrens next = zio->io_sibling_next; 1215789Sahrens prev = zio->io_sibling_prev; 1216789Sahrens if (next != NULL) 1217789Sahrens next->io_sibling_prev = prev; 1218789Sahrens if (prev != NULL) 1219789Sahrens prev->io_sibling_next = next; 1220789Sahrens if (pio->io_child == zio) 1221789Sahrens pio->io_child = next; 1222789Sahrens mutex_exit(&pio->io_lock); 1223789Sahrens 1224789Sahrens zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, 1225789Sahrens &pio->io_children_notdone); 1226789Sahrens } 1227789Sahrens 12283463Sahrens /* 12294055Seschrock * Note: this I/O is now done, and will shortly be freed, so there is no 12304055Seschrock * need to clear this (or any other) flag. 12313463Sahrens */ 12323463Sahrens if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 12331544Seschrock spa_config_exit(spa, zio); 1234789Sahrens 1235789Sahrens if (zio->io_waiter != NULL) { 1236789Sahrens mutex_enter(&zio->io_lock); 1237789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1238789Sahrens zio->io_stalled = zio->io_stage; 1239789Sahrens cv_broadcast(&zio->io_cv); 1240789Sahrens mutex_exit(&zio->io_lock); 1241789Sahrens } else { 12424831Sgw25295 mutex_destroy(&zio->io_lock); 12434831Sgw25295 cv_destroy(&zio->io_cv); 12444055Seschrock kmem_cache_free(zio_cache, zio); 1245789Sahrens } 1246789Sahrens } 1247789Sahrens 1248789Sahrens /* 1249789Sahrens * ========================================================================== 1250789Sahrens * Compression support 1251789Sahrens * ========================================================================== 1252789Sahrens */ 1253789Sahrens static void 1254789Sahrens zio_write_compress(zio_t *zio) 1255789Sahrens { 1256789Sahrens int compress = zio->io_compress; 1257789Sahrens blkptr_t *bp = zio->io_bp; 1258789Sahrens void *cbuf; 1259789Sahrens uint64_t lsize = zio->io_size; 1260789Sahrens uint64_t csize = lsize; 1261789Sahrens uint64_t cbufsize = 0; 1262789Sahrens int pass; 1263789Sahrens 1264789Sahrens if (bp->blk_birth == zio->io_txg) { 1265789Sahrens /* 1266789Sahrens * We're rewriting an existing block, which means we're 1267789Sahrens * working on behalf of spa_sync(). For spa_sync() to 1268789Sahrens * converge, it must eventually be the case that we don't 1269789Sahrens * have to allocate new blocks. But compression changes 1270789Sahrens * the blocksize, which forces a reallocate, and makes 1271789Sahrens * convergence take longer. Therefore, after the first 1272789Sahrens * few passes, stop compressing to ensure convergence. 1273789Sahrens */ 1274789Sahrens pass = spa_sync_pass(zio->io_spa); 1275789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 1276789Sahrens compress = ZIO_COMPRESS_OFF; 1277789Sahrens } else { 1278789Sahrens ASSERT(BP_IS_HOLE(bp)); 1279789Sahrens pass = 1; 1280789Sahrens } 1281789Sahrens 1282789Sahrens if (compress != ZIO_COMPRESS_OFF) 1283789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 1284789Sahrens &cbuf, &csize, &cbufsize)) 1285789Sahrens compress = ZIO_COMPRESS_OFF; 1286789Sahrens 1287789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 1288789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 1289789Sahrens 1290789Sahrens /* 1291789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 1292789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 1293789Sahrens * but newly allocated blocks are sequential, so they can be written 1294789Sahrens * to disk faster. Therefore, we allow the first few passes of 1295789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 1296789Sahrens * There should only be a handful of blocks after pass 1 in any case. 1297789Sahrens */ 1298789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 1299789Sahrens pass > zio_sync_pass.zp_rewrite) { 1300789Sahrens ASSERT(csize != 0); 13012885Sahrens BP_SET_LSIZE(bp, lsize); 13022885Sahrens BP_SET_COMPRESS(bp, compress); 1303789Sahrens zio->io_pipeline = ZIO_REWRITE_PIPELINE; 1304789Sahrens } else { 13053882Sahrens if (bp->blk_birth == zio->io_txg) 13063882Sahrens BP_ZERO(bp); 1307789Sahrens if (csize == 0) { 1308789Sahrens BP_ZERO(bp); 1309789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1310789Sahrens } else { 13111775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1312789Sahrens BP_SET_LSIZE(bp, lsize); 1313789Sahrens BP_SET_PSIZE(bp, csize); 1314789Sahrens BP_SET_COMPRESS(bp, compress); 1315789Sahrens zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE; 1316789Sahrens } 1317789Sahrens } 1318789Sahrens 1319789Sahrens zio_next_stage(zio); 1320789Sahrens } 1321789Sahrens 1322789Sahrens static void 1323789Sahrens zio_read_decompress(zio_t *zio) 1324789Sahrens { 1325789Sahrens blkptr_t *bp = zio->io_bp; 1326789Sahrens void *data; 1327789Sahrens uint64_t size; 1328789Sahrens uint64_t bufsize; 1329789Sahrens int compress = BP_GET_COMPRESS(bp); 1330789Sahrens 1331789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 1332789Sahrens 1333789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 1334789Sahrens 1335789Sahrens if (zio_decompress_data(compress, data, size, 1336789Sahrens zio->io_data, zio->io_size)) 1337789Sahrens zio->io_error = EIO; 1338789Sahrens 1339789Sahrens zio_buf_free(data, bufsize); 1340789Sahrens 1341789Sahrens zio_next_stage(zio); 1342789Sahrens } 1343789Sahrens 1344789Sahrens /* 1345789Sahrens * ========================================================================== 1346789Sahrens * Gang block support 1347789Sahrens * ========================================================================== 1348789Sahrens */ 1349789Sahrens static void 1350789Sahrens zio_gang_pipeline(zio_t *zio) 1351789Sahrens { 1352789Sahrens /* 1353789Sahrens * By default, the pipeline assumes that we're dealing with a gang 1354789Sahrens * block. If we're not, strip out any gang-specific stages. 1355789Sahrens */ 13561775Sbillm if (!BP_IS_GANG(zio->io_bp)) 1357789Sahrens zio->io_pipeline &= ~ZIO_GANG_STAGES; 1358789Sahrens 1359789Sahrens zio_next_stage(zio); 1360789Sahrens } 1361789Sahrens 1362789Sahrens static void 1363789Sahrens zio_gang_byteswap(zio_t *zio) 1364789Sahrens { 1365789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1366789Sahrens 1367789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1368789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1369789Sahrens } 1370789Sahrens 1371789Sahrens static void 1372789Sahrens zio_get_gang_header(zio_t *zio) 1373789Sahrens { 1374789Sahrens blkptr_t *bp = zio->io_bp; 1375789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1376789Sahrens void *gbuf = zio_buf_alloc(gsize); 1377789Sahrens 13781775Sbillm ASSERT(BP_IS_GANG(bp)); 1379789Sahrens 1380789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1381789Sahrens 1382789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1383789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1384789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 13855329Sgw25295 ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); 1386789Sahrens 1387789Sahrens zio_wait_children_done(zio); 1388789Sahrens } 1389789Sahrens 1390789Sahrens static void 1391789Sahrens zio_read_gang_members(zio_t *zio) 1392789Sahrens { 1393789Sahrens zio_gbh_phys_t *gbh; 1394789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1395789Sahrens int i; 1396789Sahrens 13971775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1398789Sahrens 1399789Sahrens zio_gang_byteswap(zio); 1400789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1401789Sahrens 1402789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1403789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1404789Sahrens lsize = BP_GET_PSIZE(gbp); 1405789Sahrens 1406789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1407789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1408789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1409789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1410789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1411789Sahrens 1412789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 1413789Sahrens (char *)zio->io_data + loff, lsize, NULL, NULL, 14141544Seschrock zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 14151544Seschrock &zio->io_bookmark)); 1416789Sahrens } 1417789Sahrens 1418789Sahrens zio_buf_free(gbh, gbufsize); 1419789Sahrens zio_wait_children_done(zio); 1420789Sahrens } 1421789Sahrens 1422789Sahrens static void 1423789Sahrens zio_rewrite_gang_members(zio_t *zio) 1424789Sahrens { 1425789Sahrens zio_gbh_phys_t *gbh; 1426789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1427789Sahrens int i; 1428789Sahrens 14291775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1430789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1431789Sahrens 1432789Sahrens zio_gang_byteswap(zio); 1433789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1434789Sahrens 1435789Sahrens ASSERT(gsize == gbufsize); 1436789Sahrens 1437789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1438789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1439789Sahrens lsize = BP_GET_PSIZE(gbp); 1440789Sahrens 1441789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1442789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1443789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1444789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1445789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1446789Sahrens 1447789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1448789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 14491544Seschrock NULL, NULL, zio->io_priority, zio->io_flags, 14501544Seschrock &zio->io_bookmark)); 1451789Sahrens } 1452789Sahrens 1453789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 1454789Sahrens zio_wait_children_ready(zio); 1455789Sahrens } 1456789Sahrens 1457789Sahrens static void 1458789Sahrens zio_free_gang_members(zio_t *zio) 1459789Sahrens { 1460789Sahrens zio_gbh_phys_t *gbh; 1461789Sahrens uint64_t gsize, gbufsize; 1462789Sahrens int i; 1463789Sahrens 14641775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1465789Sahrens 1466789Sahrens zio_gang_byteswap(zio); 1467789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1468789Sahrens 1469789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1470789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1471789Sahrens 1472789Sahrens if (BP_IS_HOLE(gbp)) 1473789Sahrens continue; 1474789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1475789Sahrens gbp, NULL, NULL)); 1476789Sahrens } 1477789Sahrens 1478789Sahrens zio_buf_free(gbh, gbufsize); 1479789Sahrens zio_next_stage(zio); 1480789Sahrens } 1481789Sahrens 1482789Sahrens static void 1483789Sahrens zio_claim_gang_members(zio_t *zio) 1484789Sahrens { 1485789Sahrens zio_gbh_phys_t *gbh; 1486789Sahrens uint64_t gsize, gbufsize; 1487789Sahrens int i; 1488789Sahrens 14891775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1490789Sahrens 1491789Sahrens zio_gang_byteswap(zio); 1492789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1493789Sahrens 1494789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1495789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1496789Sahrens if (BP_IS_HOLE(gbp)) 1497789Sahrens continue; 1498789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1499789Sahrens gbp, NULL, NULL)); 1500789Sahrens } 1501789Sahrens 1502789Sahrens zio_buf_free(gbh, gbufsize); 1503789Sahrens zio_next_stage(zio); 1504789Sahrens } 1505789Sahrens 1506789Sahrens static void 1507789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1508789Sahrens { 1509789Sahrens zio_t *pio = zio->io_parent; 15101775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 15111775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1512789Sahrens uint64_t asize; 15131775Sbillm int d; 1514789Sahrens 15151775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 15161775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 15171775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 15181775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 15191775Sbillm 1520789Sahrens mutex_enter(&pio->io_lock); 15211775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 15221775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 15231775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 15241775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 15251775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 15261775Sbillm } 1527789Sahrens mutex_exit(&pio->io_lock); 1528789Sahrens } 1529789Sahrens 15305329Sgw25295 static int 15314527Sperrin zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) 1532789Sahrens { 1533789Sahrens blkptr_t *bp = zio->io_bp; 15341775Sbillm dva_t *dva = bp->blk_dva; 15351775Sbillm spa_t *spa = zio->io_spa; 1536789Sahrens zio_gbh_phys_t *gbh; 15371775Sbillm uint64_t txg = zio->io_txg; 1538789Sahrens uint64_t resid = zio->io_size; 1539789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1540789Sahrens uint64_t gsize, loff, lsize; 1541789Sahrens uint32_t gbps_left; 15421775Sbillm int ndvas = zio->io_ndvas; 15431775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1544789Sahrens int error; 15451775Sbillm int i, d; 1546789Sahrens 1547789Sahrens gsize = SPA_GANGBLOCKSIZE; 1548789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1549789Sahrens 15504527Sperrin error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, 15514527Sperrin B_FALSE); 15525329Sgw25295 if (error) 15535329Sgw25295 return (error); 1554789Sahrens 15551775Sbillm for (d = 0; d < gbh_ndvas; d++) 15561775Sbillm DVA_SET_GANG(&dva[d], 1); 1557789Sahrens 15581775Sbillm bp->blk_birth = txg; 1559789Sahrens 1560789Sahrens gbh = zio_buf_alloc(gsize); 1561789Sahrens bzero(gbh, gsize); 1562789Sahrens 15631775Sbillm /* We need to test multi-level gang blocks */ 15641775Sbillm if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) 15651775Sbillm maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); 15661775Sbillm 1567789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1568789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1569789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 15701775Sbillm dva = gbp->blk_dva; 1571789Sahrens 1572789Sahrens ASSERT(gbps_left != 0); 1573789Sahrens maxalloc = MIN(maxalloc, resid); 1574789Sahrens 1575789Sahrens while (resid <= maxalloc * gbps_left) { 15764527Sperrin error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas, 15773063Sperrin txg, bp, B_FALSE); 1578789Sahrens if (error == 0) 1579789Sahrens break; 1580789Sahrens ASSERT3U(error, ==, ENOSPC); 15815329Sgw25295 /* XXX - free up previous allocations? */ 1582789Sahrens if (maxalloc == SPA_MINBLOCKSIZE) 15835329Sgw25295 return (error); 1584789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1585789Sahrens } 1586789Sahrens 1587789Sahrens if (resid <= maxalloc * gbps_left) { 1588789Sahrens lsize = maxalloc; 1589789Sahrens BP_SET_LSIZE(gbp, lsize); 1590789Sahrens BP_SET_PSIZE(gbp, lsize); 1591789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 15921775Sbillm gbp->blk_birth = txg; 15931775Sbillm zio_nowait(zio_rewrite(zio, spa, 15941775Sbillm zio->io_checksum, txg, gbp, 1595789Sahrens (char *)zio->io_data + loff, lsize, 1596789Sahrens zio_write_allocate_gang_member_done, NULL, 15975403Sgw25295 zio->io_priority, 15985403Sgw25295 zio->io_flags & ZIO_FLAG_GANG_INHERIT, 15991544Seschrock &zio->io_bookmark)); 1600789Sahrens } else { 1601789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1602789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 16031775Sbillm zio_nowait(zio_write_allocate(zio, spa, 16041775Sbillm zio->io_checksum, txg, gbp, 1605789Sahrens (char *)zio->io_data + loff, lsize, 1606789Sahrens zio_write_allocate_gang_member_done, NULL, 16075403Sgw25295 zio->io_priority, 16085403Sgw25295 zio->io_flags & ZIO_FLAG_GANG_INHERIT)); 1609789Sahrens } 1610789Sahrens } 1611789Sahrens 1612789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1613789Sahrens 1614789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1615789Sahrens 1616789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 16171775Sbillm /* 16181775Sbillm * As much as we'd like this to be zio_wait_children_ready(), 16191775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 16201775Sbillm * so we have to wait for that to finish in order for our BP 16211775Sbillm * to be stable. 16221775Sbillm */ 1623789Sahrens zio_wait_children_done(zio); 16245329Sgw25295 return (0); 1625789Sahrens } 1626789Sahrens 1627789Sahrens /* 1628789Sahrens * ========================================================================== 1629789Sahrens * Allocate and free blocks 1630789Sahrens * ========================================================================== 1631789Sahrens */ 1632789Sahrens static void 1633789Sahrens zio_dva_allocate(zio_t *zio) 1634789Sahrens { 16354527Sperrin spa_t *spa = zio->io_spa; 16364527Sperrin metaslab_class_t *mc = spa->spa_normal_class; 1637789Sahrens blkptr_t *bp = zio->io_bp; 1638789Sahrens int error; 1639789Sahrens 1640789Sahrens ASSERT(BP_IS_HOLE(bp)); 16411775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 16421775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 16434527Sperrin ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); 1644789Sahrens 1645789Sahrens /* For testing, make some blocks above a certain size be gang blocks */ 1646789Sahrens if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { 16475329Sgw25295 error = zio_write_allocate_gang_members(zio, mc); 16485329Sgw25295 if (error) 16495329Sgw25295 zio->io_error = error; 1650789Sahrens return; 1651789Sahrens } 1652789Sahrens 16535329Sgw25295 /* 16545329Sgw25295 * For testing purposes, we force I/Os to retry. We don't allow 16555329Sgw25295 * retries beyond the first pass since those I/Os are non-allocating 16565403Sgw25295 * writes. 16575329Sgw25295 */ 16585329Sgw25295 if (zio_io_fail_shift && 16595329Sgw25295 spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && 16605329Sgw25295 zio_io_should_fail(zio_io_fail_shift)) 16615329Sgw25295 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 16625329Sgw25295 1663789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1664789Sahrens 16654527Sperrin error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, 16663063Sperrin zio->io_txg, NULL, B_FALSE); 1667789Sahrens 1668789Sahrens if (error == 0) { 1669789Sahrens bp->blk_birth = zio->io_txg; 16705329Sgw25295 } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { 16715329Sgw25295 error = zio_write_allocate_gang_members(zio, mc); 16725329Sgw25295 if (error == 0) 16735329Sgw25295 return; 16745329Sgw25295 zio->io_error = error; 1675789Sahrens } else { 1676789Sahrens zio->io_error = error; 1677789Sahrens } 1678789Sahrens zio_next_stage(zio); 1679789Sahrens } 1680789Sahrens 1681789Sahrens static void 1682789Sahrens zio_dva_free(zio_t *zio) 1683789Sahrens { 1684789Sahrens blkptr_t *bp = zio->io_bp; 1685789Sahrens 16861807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1687789Sahrens 1688789Sahrens BP_ZERO(bp); 1689789Sahrens 1690789Sahrens zio_next_stage(zio); 1691789Sahrens } 1692789Sahrens 1693789Sahrens static void 1694789Sahrens zio_dva_claim(zio_t *zio) 1695789Sahrens { 16961807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1697789Sahrens 1698789Sahrens zio_next_stage(zio); 1699789Sahrens } 1700789Sahrens 1701789Sahrens /* 1702789Sahrens * ========================================================================== 1703789Sahrens * Read and write to physical devices 1704789Sahrens * ========================================================================== 1705789Sahrens */ 1706789Sahrens 1707789Sahrens static void 17081775Sbillm zio_vdev_io_start(zio_t *zio) 1709789Sahrens { 1710789Sahrens vdev_t *vd = zio->io_vd; 17111775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 17121775Sbillm blkptr_t *bp = zio->io_bp; 17131775Sbillm uint64_t align; 17145329Sgw25295 spa_t *spa = zio->io_spa; 17155329Sgw25295 17165329Sgw25295 /* 17175329Sgw25295 * If the pool is already in a failure state then just suspend 17185329Sgw25295 * this IO until the problem is resolved. We will reissue them 17195329Sgw25295 * at that time. 17205329Sgw25295 */ 17215329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE && 17225329Sgw25295 zio->io_type == ZIO_TYPE_WRITE) { 17235329Sgw25295 zio_vdev_suspend_io(zio); 17245329Sgw25295 return; 17255329Sgw25295 } 1726789Sahrens 17271775Sbillm if (vd == NULL) { 17281775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 17291775Sbillm vdev_mirror_ops.vdev_op_io_start(zio); 17301775Sbillm return; 17311775Sbillm } 17321775Sbillm 17331775Sbillm align = 1ULL << tvd->vdev_ashift; 17341775Sbillm 17351732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1736789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1737789Sahrens 17381775Sbillm if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && 17391775Sbillm vd->vdev_children == 0) { 1740789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1741789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1742789Sahrens } 1743789Sahrens 17441732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 17451732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 17461732Sbonwick char *abuf = zio_buf_alloc(asize); 17471732Sbonwick ASSERT(vd == tvd); 17481732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 17491732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 17501732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 17511732Sbonwick } 17521732Sbonwick zio_push_transform(zio, abuf, asize, asize); 17531732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 17541732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 17551732Sbonwick } 17561732Sbonwick 17571732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 17581732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 17591732Sbonwick ASSERT(bp == NULL || 17601732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1761789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1762789Sahrens 1763789Sahrens vdev_io_start(zio); 1764789Sahrens 1765789Sahrens /* zio_next_stage_async() gets called from io completion interrupt */ 1766789Sahrens } 1767789Sahrens 1768789Sahrens static void 1769789Sahrens zio_vdev_io_done(zio_t *zio) 1770789Sahrens { 17711775Sbillm if (zio->io_vd == NULL) 17721775Sbillm /* The mirror_ops handle multiple DVAs in a single BP */ 17731775Sbillm vdev_mirror_ops.vdev_op_io_done(zio); 17741775Sbillm else 17751775Sbillm vdev_io_done(zio); 1776789Sahrens } 1777789Sahrens 1778789Sahrens /* XXPOLICY */ 17791544Seschrock boolean_t 1780789Sahrens zio_should_retry(zio_t *zio) 1781789Sahrens { 1782789Sahrens vdev_t *vd = zio->io_vd; 1783789Sahrens 1784789Sahrens if (zio->io_error == 0) 1785789Sahrens return (B_FALSE); 1786789Sahrens if (zio->io_delegate_list != NULL) 1787789Sahrens return (B_FALSE); 17881775Sbillm if (vd && vd != vd->vdev_top) 1789789Sahrens return (B_FALSE); 1790789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1791789Sahrens return (B_FALSE); 17921544Seschrock if (zio->io_retries > 0) 1793789Sahrens return (B_FALSE); 1794789Sahrens 1795789Sahrens return (B_TRUE); 1796789Sahrens } 1797789Sahrens 1798789Sahrens static void 1799789Sahrens zio_vdev_io_assess(zio_t *zio) 1800789Sahrens { 1801789Sahrens vdev_t *vd = zio->io_vd; 18021775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1803789Sahrens 18041544Seschrock ASSERT(zio->io_vsd == NULL); 1805789Sahrens 18061732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 18071732Sbonwick void *abuf; 18081732Sbonwick uint64_t asize; 18091732Sbonwick ASSERT(vd == tvd); 18101732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 18111732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 18121732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 18131732Sbonwick zio_buf_free(abuf, asize); 18141732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 18151732Sbonwick } 18161732Sbonwick 18171544Seschrock if (zio_injection_enabled && !zio->io_error) 18181544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1819789Sahrens 1820789Sahrens /* 1821789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1822789Sahrens */ 1823789Sahrens /* XXPOLICY */ 1824789Sahrens if (zio_should_retry(zio)) { 1825789Sahrens ASSERT(tvd == vd); 1826789Sahrens 1827789Sahrens zio->io_retries++; 1828789Sahrens zio->io_error = 0; 18293463Sahrens zio->io_flags &= ZIO_FLAG_VDEV_INHERIT | 18303463Sahrens ZIO_FLAG_CONFIG_GRABBED; 1831789Sahrens /* XXPOLICY */ 1832789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1833789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 18341775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1835789Sahrens 1836789Sahrens dprintf("retry #%d for %s to %s offset %llx\n", 1837789Sahrens zio->io_retries, zio_type_name[zio->io_type], 1838789Sahrens vdev_description(vd), zio->io_offset); 1839789Sahrens 18401544Seschrock zio_next_stage_async(zio); 18411544Seschrock return; 18421544Seschrock } 1843789Sahrens 1844789Sahrens zio_next_stage(zio); 1845789Sahrens } 1846789Sahrens 1847789Sahrens void 1848789Sahrens zio_vdev_io_reissue(zio_t *zio) 1849789Sahrens { 1850789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1851789Sahrens ASSERT(zio->io_error == 0); 1852789Sahrens 1853789Sahrens zio->io_stage--; 1854789Sahrens } 1855789Sahrens 1856789Sahrens void 1857789Sahrens zio_vdev_io_redone(zio_t *zio) 1858789Sahrens { 1859789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1860789Sahrens 1861789Sahrens zio->io_stage--; 1862789Sahrens } 1863789Sahrens 1864789Sahrens void 1865789Sahrens zio_vdev_io_bypass(zio_t *zio) 1866789Sahrens { 1867789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1868789Sahrens ASSERT(zio->io_error == 0); 1869789Sahrens 1870789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1871789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1872789Sahrens } 1873789Sahrens 1874789Sahrens /* 1875789Sahrens * ========================================================================== 1876789Sahrens * Generate and verify checksums 1877789Sahrens * ========================================================================== 1878789Sahrens */ 1879789Sahrens static void 1880789Sahrens zio_checksum_generate(zio_t *zio) 1881789Sahrens { 1882789Sahrens int checksum = zio->io_checksum; 1883789Sahrens blkptr_t *bp = zio->io_bp; 1884789Sahrens 1885789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1886789Sahrens 1887789Sahrens BP_SET_CHECKSUM(bp, checksum); 1888789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1889789Sahrens 1890789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1891789Sahrens 1892789Sahrens zio_next_stage(zio); 1893789Sahrens } 1894789Sahrens 1895789Sahrens static void 1896789Sahrens zio_gang_checksum_generate(zio_t *zio) 1897789Sahrens { 1898789Sahrens zio_cksum_t zc; 1899789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1900789Sahrens 19011775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1902789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1903789Sahrens 1904789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1905789Sahrens 1906789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1907789Sahrens 1908789Sahrens zio_next_stage(zio); 1909789Sahrens } 1910789Sahrens 1911789Sahrens static void 1912789Sahrens zio_checksum_verify(zio_t *zio) 1913789Sahrens { 1914789Sahrens if (zio->io_bp != NULL) { 1915789Sahrens zio->io_error = zio_checksum_error(zio); 19161544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 19171544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 19181544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1919789Sahrens } 1920789Sahrens 1921789Sahrens zio_next_stage(zio); 1922789Sahrens } 1923789Sahrens 1924789Sahrens /* 1925789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1926789Sahrens */ 1927789Sahrens void 1928789Sahrens zio_checksum_verified(zio_t *zio) 1929789Sahrens { 1930789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1931789Sahrens } 1932789Sahrens 1933789Sahrens /* 1934789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1935789Sahrens */ 1936789Sahrens void 1937789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1938789Sahrens { 19391775Sbillm blkptr_t *bp = zio->io_bp; 19401775Sbillm 19411775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 19421775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 19431775Sbillm zcp->zc_word[2] = bp->blk_birth; 1944789Sahrens zcp->zc_word[3] = 0; 1945789Sahrens } 1946789Sahrens 1947789Sahrens /* 1948789Sahrens * ========================================================================== 1949789Sahrens * Define the pipeline 1950789Sahrens * ========================================================================== 1951789Sahrens */ 1952789Sahrens typedef void zio_pipe_stage_t(zio_t *zio); 1953789Sahrens 1954789Sahrens static void 1955789Sahrens zio_badop(zio_t *zio) 1956789Sahrens { 1957789Sahrens panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); 1958789Sahrens } 1959789Sahrens 1960789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 1961789Sahrens zio_badop, 1962789Sahrens zio_wait_children_ready, 1963789Sahrens zio_write_compress, 1964789Sahrens zio_checksum_generate, 1965789Sahrens zio_gang_pipeline, 1966789Sahrens zio_get_gang_header, 1967789Sahrens zio_rewrite_gang_members, 1968789Sahrens zio_free_gang_members, 1969789Sahrens zio_claim_gang_members, 1970789Sahrens zio_dva_allocate, 1971789Sahrens zio_dva_free, 1972789Sahrens zio_dva_claim, 1973789Sahrens zio_gang_checksum_generate, 1974789Sahrens zio_ready, 19755329Sgw25295 zio_read_init, 1976789Sahrens zio_vdev_io_start, 1977789Sahrens zio_vdev_io_done, 1978789Sahrens zio_vdev_io_assess, 1979789Sahrens zio_wait_children_done, 1980789Sahrens zio_checksum_verify, 1981789Sahrens zio_read_gang_members, 1982789Sahrens zio_read_decompress, 19835329Sgw25295 zio_assess, 1984789Sahrens zio_done, 1985789Sahrens zio_badop 1986789Sahrens }; 1987789Sahrens 1988789Sahrens /* 1989789Sahrens * Move an I/O to the next stage of the pipeline and execute that stage. 1990789Sahrens * There's no locking on io_stage because there's no legitimate way for 1991789Sahrens * multiple threads to be attempting to process the same I/O. 1992789Sahrens */ 1993789Sahrens void 1994789Sahrens zio_next_stage(zio_t *zio) 1995789Sahrens { 1996789Sahrens uint32_t pipeline = zio->io_pipeline; 1997789Sahrens 1998789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 1999789Sahrens 2000789Sahrens if (zio->io_error) { 2001789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 2002789Sahrens zio, vdev_description(zio->io_vd), 2003789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 2004789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 2005789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 2006789Sahrens } 2007789Sahrens 2008789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 2009789Sahrens continue; 2010789Sahrens 2011789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 2012789Sahrens ASSERT(zio->io_stalled == 0); 2013789Sahrens 20143689Sek110237 /* 20153689Sek110237 * See the comment in zio_next_stage_async() about per-CPU taskqs. 20163689Sek110237 */ 20173689Sek110237 if (((1U << zio->io_stage) & zio->io_async_stages) && 20183689Sek110237 (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) && 20193689Sek110237 !(zio->io_flags & ZIO_FLAG_METADATA)) { 20203689Sek110237 taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 20213689Sek110237 (void) taskq_dispatch(tq, 20223689Sek110237 (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 20233689Sek110237 } else { 20243689Sek110237 zio_pipeline[zio->io_stage](zio); 20253689Sek110237 } 2026789Sahrens } 2027789Sahrens 2028789Sahrens void 2029789Sahrens zio_next_stage_async(zio_t *zio) 2030789Sahrens { 2031789Sahrens taskq_t *tq; 2032789Sahrens uint32_t pipeline = zio->io_pipeline; 2033789Sahrens 2034789Sahrens ASSERT(!MUTEX_HELD(&zio->io_lock)); 2035789Sahrens 2036789Sahrens if (zio->io_error) { 2037789Sahrens dprintf("zio %p vdev %s offset %llx stage %d error %d\n", 2038789Sahrens zio, vdev_description(zio->io_vd), 2039789Sahrens zio->io_offset, zio->io_stage, zio->io_error); 2040789Sahrens if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) 2041789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 2042789Sahrens } 2043789Sahrens 2044789Sahrens while (((1U << ++zio->io_stage) & pipeline) == 0) 2045789Sahrens continue; 2046789Sahrens 2047789Sahrens ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 2048789Sahrens ASSERT(zio->io_stalled == 0); 2049789Sahrens 2050789Sahrens /* 2051789Sahrens * For performance, we'll probably want two sets of task queues: 2052789Sahrens * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU 2053789Sahrens * part is for read performance: since we have to make a pass over 2054789Sahrens * the data to checksum it anyway, we want to do this on the same CPU 2055789Sahrens * that issued the read, because (assuming CPU scheduling affinity) 2056789Sahrens * that thread is probably still there. Getting this optimization 2057789Sahrens * right avoids performance-hostile cache-to-cache transfers. 2058789Sahrens * 2059789Sahrens * Note that having two sets of task queues is also necessary for 2060789Sahrens * correctness: if all of the issue threads get bogged down waiting 2061789Sahrens * for dependent reads (e.g. metaslab freelist) to complete, then 2062789Sahrens * there won't be any threads available to service I/O completion 2063789Sahrens * interrupts. 2064789Sahrens */ 2065789Sahrens if ((1U << zio->io_stage) & zio->io_async_stages) { 2066789Sahrens if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) 2067789Sahrens tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; 2068789Sahrens else 2069789Sahrens tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; 2070789Sahrens (void) taskq_dispatch(tq, 2071789Sahrens (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); 2072789Sahrens } else { 2073789Sahrens zio_pipeline[zio->io_stage](zio); 2074789Sahrens } 2075789Sahrens } 2076789Sahrens 20775329Sgw25295 void 20785329Sgw25295 zio_resubmit_stage_async(void *arg) 20795329Sgw25295 { 20805329Sgw25295 zio_t *zio = (zio_t *)(uintptr_t)arg; 20815329Sgw25295 20825329Sgw25295 zio_next_stage_async(zio); 20835329Sgw25295 } 20845329Sgw25295 20853668Sgw25295 static boolean_t 20865329Sgw25295 zio_io_should_fail(uint16_t range) 20873668Sgw25295 { 20883668Sgw25295 static uint16_t allocs = 0; 20893668Sgw25295 20905329Sgw25295 return (P2PHASE(allocs++, 1U<<range) == 0); 20913668Sgw25295 } 20923668Sgw25295 2093789Sahrens /* 2094789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 2095789Sahrens */ 2096789Sahrens int 20973063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 20983063Sperrin uint64_t txg) 2099789Sahrens { 2100789Sahrens int error; 2101789Sahrens 21021544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2103789Sahrens 21045329Sgw25295 if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) { 21053668Sgw25295 spa_config_exit(spa, FTAG); 21063668Sgw25295 return (ENOSPC); 21073668Sgw25295 } 21083668Sgw25295 21093063Sperrin /* 21104527Sperrin * We were passed the previous log block's DVA in bp->blk_dva[0]. 21114527Sperrin * We use that as a hint for which vdev to allocate from next. 21123063Sperrin */ 21134527Sperrin error = metaslab_alloc(spa, spa->spa_log_class, size, 21144527Sperrin new_bp, 1, txg, old_bp, B_TRUE); 21154527Sperrin 21164527Sperrin if (error) 21174527Sperrin error = metaslab_alloc(spa, spa->spa_normal_class, size, 21184527Sperrin new_bp, 1, txg, old_bp, B_TRUE); 2119789Sahrens 2120789Sahrens if (error == 0) { 21213063Sperrin BP_SET_LSIZE(new_bp, size); 21223063Sperrin BP_SET_PSIZE(new_bp, size); 21233063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 21243063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 21253063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 21263063Sperrin BP_SET_LEVEL(new_bp, 0); 21273063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 21283063Sperrin new_bp->blk_birth = txg; 2129789Sahrens } 2130789Sahrens 21311544Seschrock spa_config_exit(spa, FTAG); 2132789Sahrens 2133789Sahrens return (error); 2134789Sahrens } 2135789Sahrens 2136789Sahrens /* 2137789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 2138789Sahrens * nothing to do except metaslab_free() it. 2139789Sahrens */ 2140789Sahrens void 2141789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 2142789Sahrens { 21431775Sbillm ASSERT(!BP_IS_GANG(bp)); 2144789Sahrens 21451544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2146789Sahrens 21471807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 2148789Sahrens 21491544Seschrock spa_config_exit(spa, FTAG); 2150789Sahrens } 21514469Sperrin 21524469Sperrin /* 21534469Sperrin * start an async flush of the write cache for this vdev 21544469Sperrin */ 21554469Sperrin void 21564469Sperrin zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio) 21574469Sperrin { 21584469Sperrin vdev_t *vd; 21594469Sperrin 21604469Sperrin /* 21614469Sperrin * Lock out configuration changes. 21624469Sperrin */ 21634469Sperrin spa_config_enter(spa, RW_READER, FTAG); 21644469Sperrin 21654469Sperrin if (*zio == NULL) 21664469Sperrin *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 21674469Sperrin 21684469Sperrin vd = vdev_lookup_top(spa, vdev); 21694469Sperrin ASSERT(vd); 21704469Sperrin 21714469Sperrin (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE, 21724469Sperrin NULL, NULL, ZIO_PRIORITY_NOW, 21734469Sperrin ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); 21744469Sperrin 21754469Sperrin spa_config_exit(spa, FTAG); 21764469Sperrin } 2177