1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 226245Smaybee * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 
24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 643668Sgw25295 /* Force an allocation failure when non-zero */ 653668Sgw25295 uint16_t zio_zil_fail_shift = 0; 665329Sgw25295 uint16_t zio_io_fail_shift = 0; 675329Sgw25295 685329Sgw25295 /* Enable/disable the write-retry logic */ 695329Sgw25295 int zio_write_retry = 1; 705329Sgw25295 715329Sgw25295 /* Taskq to handle reissuing of I/Os */ 725329Sgw25295 taskq_t *zio_taskq; 
735329Sgw25295 int zio_resume_threads = 4; 743668Sgw25295 75789Sahrens typedef struct zio_sync_pass { 76789Sahrens int zp_defer_free; /* defer frees after this pass */ 77789Sahrens int zp_dontcompress; /* don't compress after this pass */ 78789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 79789Sahrens } zio_sync_pass_t; 80789Sahrens 81789Sahrens zio_sync_pass_t zio_sync_pass = { 82789Sahrens 1, /* zp_defer_free */ 83789Sahrens 4, /* zp_dontcompress */ 84789Sahrens 1, /* zp_rewrite */ 85789Sahrens }; 86789Sahrens 875329Sgw25295 static boolean_t zio_io_should_fail(uint16_t); 885329Sgw25295 89789Sahrens /* 90789Sahrens * ========================================================================== 91789Sahrens * I/O kmem caches 92789Sahrens * ========================================================================== 93789Sahrens */ 944055Seschrock kmem_cache_t *zio_cache; 95789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 963290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 973290Sjohansen 983290Sjohansen #ifdef _KERNEL 993290Sjohansen extern vmem_t *zio_alloc_arena; 1003290Sjohansen #endif 101789Sahrens 1025329Sgw25295 /* 1035329Sgw25295 * Determine if we are allowed to issue the IO based on the 1045329Sgw25295 * pool state. If we must wait then block until we are told 1055329Sgw25295 * that we may continue. 1065329Sgw25295 */ 1075329Sgw25295 #define ZIO_ENTER(spa) { \ 1085329Sgw25295 if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ 1095329Sgw25295 mutex_enter(&spa->spa_zio_lock); \ 1105329Sgw25295 while (spa->spa_state == POOL_STATE_IO_FAILURE) \ 1115329Sgw25295 cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ 1125329Sgw25295 mutex_exit(&spa->spa_zio_lock); \ 1135329Sgw25295 } \ 1145329Sgw25295 } 1155329Sgw25295 1165329Sgw25295 /* 1175329Sgw25295 * An allocation zio is one that either currently has the DVA allocate 1185329Sgw25295 * stage set or will have it later in it's lifetime. 
1195329Sgw25295 */ 1205329Sgw25295 #define IO_IS_ALLOCATING(zio) \ 1215688Sbonwick ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 1225329Sgw25295 123789Sahrens void 124789Sahrens zio_init(void) 125789Sahrens { 126789Sahrens size_t c; 1273290Sjohansen vmem_t *data_alloc_arena = NULL; 1283290Sjohansen 1293290Sjohansen #ifdef _KERNEL 1303290Sjohansen data_alloc_arena = zio_alloc_arena; 1313290Sjohansen #endif 132789Sahrens 1334055Seschrock zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 1344055Seschrock NULL, NULL, NULL, NULL, NULL, 0); 1354055Seschrock 136789Sahrens /* 137789Sahrens * For small buffers, we want a cache for each multiple of 138789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 139789Sahrens * for each quarter-power of 2. For large buffers, we want 140789Sahrens * a cache for each multiple of PAGESIZE. 141789Sahrens */ 142789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 143789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 144789Sahrens size_t p2 = size; 145789Sahrens size_t align = 0; 146789Sahrens 147789Sahrens while (p2 & (p2 - 1)) 148789Sahrens p2 &= p2 - 1; 149789Sahrens 150789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 151789Sahrens align = SPA_MINBLOCKSIZE; 152789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 153789Sahrens align = PAGESIZE; 154789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 155789Sahrens align = p2 >> 2; 156789Sahrens } 157789Sahrens 158789Sahrens if (align != 0) { 1593290Sjohansen char name[36]; 1602856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 161789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 162849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1633290Sjohansen 1643290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1653290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1663290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1673290Sjohansen KMC_NODEBUG); 
1683290Sjohansen 169789Sahrens } 170789Sahrens } 171789Sahrens 172789Sahrens while (--c != 0) { 173789Sahrens ASSERT(zio_buf_cache[c] != NULL); 174789Sahrens if (zio_buf_cache[c - 1] == NULL) 175789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1763290Sjohansen 1773290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1783290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1793290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 180789Sahrens } 1811544Seschrock 1825329Sgw25295 zio_taskq = taskq_create("zio_taskq", zio_resume_threads, 1835329Sgw25295 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 1845329Sgw25295 1851544Seschrock zio_inject_init(); 186789Sahrens } 187789Sahrens 188789Sahrens void 189789Sahrens zio_fini(void) 190789Sahrens { 191789Sahrens size_t c; 192789Sahrens kmem_cache_t *last_cache = NULL; 1933290Sjohansen kmem_cache_t *last_data_cache = NULL; 194789Sahrens 195789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 196789Sahrens if (zio_buf_cache[c] != last_cache) { 197789Sahrens last_cache = zio_buf_cache[c]; 198789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 199789Sahrens } 200789Sahrens zio_buf_cache[c] = NULL; 2013290Sjohansen 2023290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 2033290Sjohansen last_data_cache = zio_data_buf_cache[c]; 2043290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 2053290Sjohansen } 2063290Sjohansen zio_data_buf_cache[c] = NULL; 207789Sahrens } 2081544Seschrock 2095329Sgw25295 taskq_destroy(zio_taskq); 2105329Sgw25295 2114055Seschrock kmem_cache_destroy(zio_cache); 2124055Seschrock 2131544Seschrock zio_inject_fini(); 214789Sahrens } 215789Sahrens 216789Sahrens /* 217789Sahrens * ========================================================================== 218789Sahrens * Allocate and free I/O buffers 219789Sahrens * ========================================================================== 220789Sahrens */ 2213290Sjohansen 2223290Sjohansen /* 2233290Sjohansen * Use 
zio_buf_alloc to allocate ZFS metadata. This data will appear in a 2243290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 2253290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 2263290Sjohansen * excess / transient data in-core during a crashdump. 2273290Sjohansen */ 228789Sahrens void * 229789Sahrens zio_buf_alloc(size_t size) 230789Sahrens { 231789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 232789Sahrens 233789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 234789Sahrens 2356245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 236789Sahrens } 237789Sahrens 2383290Sjohansen /* 2393290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2403290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2413290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2423290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2433290Sjohansen */ 2443290Sjohansen void * 2453290Sjohansen zio_data_buf_alloc(size_t size) 2463290Sjohansen { 2473290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2483290Sjohansen 2493290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2503290Sjohansen 2516245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2523290Sjohansen } 2533290Sjohansen 254789Sahrens void 255789Sahrens zio_buf_free(void *buf, size_t size) 256789Sahrens { 257789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 258789Sahrens 259789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 260789Sahrens 261789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 262789Sahrens } 263789Sahrens 2643290Sjohansen void 2653290Sjohansen zio_data_buf_free(void *buf, size_t size) 2663290Sjohansen { 2673290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2683290Sjohansen 2693290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> 
SPA_MINBLOCKSHIFT); 2703290Sjohansen 2713290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2723290Sjohansen } 2733463Sahrens 274789Sahrens /* 275789Sahrens * ========================================================================== 276789Sahrens * Push and pop I/O transform buffers 277789Sahrens * ========================================================================== 278789Sahrens */ 279789Sahrens static void 280789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 281789Sahrens { 282789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 283789Sahrens 284789Sahrens zt->zt_data = data; 285789Sahrens zt->zt_size = size; 286789Sahrens zt->zt_bufsize = bufsize; 287789Sahrens 288789Sahrens zt->zt_next = zio->io_transform_stack; 289789Sahrens zio->io_transform_stack = zt; 290789Sahrens 291789Sahrens zio->io_data = data; 292789Sahrens zio->io_size = size; 293789Sahrens } 294789Sahrens 295789Sahrens static void 296789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 297789Sahrens { 298789Sahrens zio_transform_t *zt = zio->io_transform_stack; 299789Sahrens 300789Sahrens *data = zt->zt_data; 301789Sahrens *size = zt->zt_size; 302789Sahrens *bufsize = zt->zt_bufsize; 303789Sahrens 304789Sahrens zio->io_transform_stack = zt->zt_next; 305789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 306789Sahrens 307789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 308789Sahrens zio->io_data = zt->zt_data; 309789Sahrens zio->io_size = zt->zt_size; 310789Sahrens } 311789Sahrens } 312789Sahrens 313789Sahrens static void 314789Sahrens zio_clear_transform_stack(zio_t *zio) 315789Sahrens { 316789Sahrens void *data; 317789Sahrens uint64_t size, bufsize; 318789Sahrens 319789Sahrens ASSERT(zio->io_transform_stack != NULL); 320789Sahrens 321789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 322789Sahrens while (zio->io_transform_stack != NULL) { 323789Sahrens 
zio_buf_free(data, bufsize); 324789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 325789Sahrens } 326789Sahrens } 327789Sahrens 328789Sahrens /* 329789Sahrens * ========================================================================== 330789Sahrens * Create the various types of I/O (read, write, free) 331789Sahrens * ========================================================================== 332789Sahrens */ 333789Sahrens static zio_t * 334789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 335789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 336789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 337789Sahrens { 338789Sahrens zio_t *zio; 339789Sahrens 340789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 341789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 342789Sahrens 343*7046Sahrens /* Only we should set CONFIG_GRABBED */ 344*7046Sahrens ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED)); 345*7046Sahrens 3464055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 3474055Seschrock bzero(zio, sizeof (zio_t)); 348789Sahrens zio->io_parent = pio; 349789Sahrens zio->io_spa = spa; 350789Sahrens zio->io_txg = txg; 3514634Sek110237 zio->io_flags = flags; 352789Sahrens if (bp != NULL) { 353789Sahrens zio->io_bp = bp; 354789Sahrens zio->io_bp_copy = *bp; 355789Sahrens zio->io_bp_orig = *bp; 356789Sahrens } 357789Sahrens zio->io_done = done; 358789Sahrens zio->io_private = private; 359789Sahrens zio->io_type = type; 360789Sahrens zio->io_priority = priority; 361789Sahrens zio->io_stage = stage; 362789Sahrens zio->io_pipeline = pipeline; 363789Sahrens zio->io_timestamp = lbolt64; 3642856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 3654831Sgw25295 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 366789Sahrens zio_push_transform(zio, data, size, size); 367789Sahrens 3683463Sahrens /* 3693463Sahrens * Note on config lock: 3703463Sahrens * 3713463Sahrens * If CONFIG_HELD 
is set, then the caller already has the config 3723463Sahrens * lock, so we don't need it for this io. 3733463Sahrens * 3743463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 3753463Sahrens * config lock on behalf of this io, so it should be released 3763463Sahrens * in zio_done. 3773463Sahrens * 3783463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 3793463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 3803463Sahrens * The NULL top-level ios rarely have any children, so we delay 3813463Sahrens * grabbing the lock until the first child is added (but it is 3823463Sahrens * still grabbed on behalf of the top-level i/o, so additional 3833463Sahrens * children don't need to also grab it). This greatly reduces 3843463Sahrens * contention on the config lock. 3853463Sahrens */ 386789Sahrens if (pio == NULL) { 3873463Sahrens if (type != ZIO_TYPE_NULL && 3883463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3895530Sbonwick spa_config_enter(spa, RW_READER, zio); 3903463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3913463Sahrens } 392789Sahrens zio->io_root = zio; 393789Sahrens } else { 394789Sahrens zio->io_root = pio->io_root; 3951544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3961544Seschrock zio->io_logical = pio->io_logical; 397789Sahrens mutex_enter(&pio->io_lock); 3983463Sahrens if (pio->io_parent == NULL && 3993463Sahrens pio->io_type == ZIO_TYPE_NULL && 4003463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 4013463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 4023463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 4035530Sbonwick spa_config_enter(spa, RW_READER, pio); 4043463Sahrens } 405789Sahrens if (stage < ZIO_STAGE_READY) 406789Sahrens pio->io_children_notready++; 407789Sahrens pio->io_children_notdone++; 408789Sahrens zio->io_sibling_next = pio->io_child; 409789Sahrens zio->io_sibling_prev = NULL; 410789Sahrens if (pio->io_child != NULL) 411789Sahrens 
pio->io_child->io_sibling_prev = zio; 412789Sahrens pio->io_child = zio; 4131775Sbillm zio->io_ndvas = pio->io_ndvas; 414789Sahrens mutex_exit(&pio->io_lock); 415789Sahrens } 416789Sahrens 4175329Sgw25295 /* 4185329Sgw25295 * Save off the original state incase we need to retry later. 4195329Sgw25295 */ 4205329Sgw25295 zio->io_orig_stage = zio->io_stage; 4215329Sgw25295 zio->io_orig_pipeline = zio->io_pipeline; 4225329Sgw25295 zio->io_orig_flags = zio->io_flags; 4235329Sgw25295 424*7046Sahrens /* 425*7046Sahrens * If this is not a null zio, and config is not already held, 426*7046Sahrens * then the root zio should have grabbed the config lock. 427*7046Sahrens * If this is not a root zio, it should not have grabbed the 428*7046Sahrens * config lock. 429*7046Sahrens */ 430*7046Sahrens ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) || 431*7046Sahrens zio->io_type == ZIO_TYPE_NULL || 432*7046Sahrens (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED)); 433*7046Sahrens ASSERT(zio->io_root == zio || 434*7046Sahrens !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)); 435*7046Sahrens 436789Sahrens return (zio); 437789Sahrens } 438789Sahrens 4395329Sgw25295 static void 4405329Sgw25295 zio_reset(zio_t *zio) 4415329Sgw25295 { 4425329Sgw25295 zio_clear_transform_stack(zio); 4435329Sgw25295 4445329Sgw25295 zio->io_flags = zio->io_orig_flags; 4455329Sgw25295 zio->io_stage = zio->io_orig_stage; 4465329Sgw25295 zio->io_pipeline = zio->io_orig_pipeline; 4475329Sgw25295 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); 4485329Sgw25295 } 4495329Sgw25295 450789Sahrens zio_t * 451789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 452789Sahrens int flags) 453789Sahrens { 454789Sahrens zio_t *zio; 455789Sahrens 456789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 457789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 458789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 459789Sahrens 460789Sahrens return (zio); 
461789Sahrens } 462789Sahrens 463789Sahrens zio_t * 464789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 465789Sahrens { 466789Sahrens return (zio_null(NULL, spa, done, private, flags)); 467789Sahrens } 468789Sahrens 469789Sahrens zio_t * 470*7046Sahrens zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, 471789Sahrens uint64_t size, zio_done_func_t *done, void *private, 472*7046Sahrens int priority, int flags, const zbookmark_t *zb) 473789Sahrens { 474789Sahrens zio_t *zio; 475789Sahrens 476789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 477789Sahrens 4785329Sgw25295 /* 4795329Sgw25295 * If the user has specified that we allow I/Os to continue 4805329Sgw25295 * then attempt to satisfy the read. 4815329Sgw25295 */ 4825329Sgw25295 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 4835329Sgw25295 ZIO_ENTER(spa); 4845329Sgw25295 485*7046Sahrens zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, 486*7046Sahrens data, size, done, private, 4872981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4882981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4891544Seschrock zio->io_bookmark = *zb; 4901544Seschrock 4911544Seschrock zio->io_logical = zio; 492789Sahrens 493789Sahrens /* 494789Sahrens * Work off our copy of the bp so the caller can free it. 
495789Sahrens */ 496789Sahrens zio->io_bp = &zio->io_bp_copy; 497789Sahrens 498789Sahrens return (zio); 499789Sahrens } 500789Sahrens 501789Sahrens zio_t * 5021775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 503789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 5043547Smaybee zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 505*7046Sahrens int flags, const zbookmark_t *zb) 506789Sahrens { 507789Sahrens zio_t *zio; 508789Sahrens 509789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 510789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 511789Sahrens 512789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 513789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 514789Sahrens 5155329Sgw25295 ZIO_ENTER(spa); 5165329Sgw25295 517789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 5182981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 519789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 520789Sahrens 5213547Smaybee zio->io_ready = ready; 5223547Smaybee 5231544Seschrock zio->io_bookmark = *zb; 5241544Seschrock 5251544Seschrock zio->io_logical = zio; 5261544Seschrock 527789Sahrens zio->io_checksum = checksum; 528789Sahrens zio->io_compress = compress; 5291775Sbillm zio->io_ndvas = ncopies; 530789Sahrens 531789Sahrens if (bp->blk_birth != txg) { 532789Sahrens /* XXX the bp usually (always?) 
gets re-zeroed later */ 533789Sahrens BP_ZERO(bp); 534789Sahrens BP_SET_LSIZE(bp, size); 535789Sahrens BP_SET_PSIZE(bp, size); 5361775Sbillm } else { 5371775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 5381775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 5391775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 540789Sahrens } 541789Sahrens 542789Sahrens return (zio); 543789Sahrens } 544789Sahrens 545789Sahrens zio_t * 5467030Sperrin zio_rewrite(zio_t *pio, spa_t *spa, int checksum, blkptr_t *bp, void *data, 5477030Sperrin uint64_t size, zio_done_func_t *done, void *private, int priority, 5487030Sperrin int flags, zbookmark_t *zb) 549789Sahrens { 550789Sahrens zio_t *zio; 551789Sahrens 5527030Sperrin zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 5532981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 5545530Sbonwick ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); 555789Sahrens 5561544Seschrock zio->io_bookmark = *zb; 557789Sahrens zio->io_checksum = checksum; 558789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 559789Sahrens 5601775Sbillm if (pio != NULL) 5611775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 5621775Sbillm 563789Sahrens return (zio); 564789Sahrens } 565789Sahrens 5665329Sgw25295 static void 5675329Sgw25295 zio_write_allocate_ready(zio_t *zio) 5685329Sgw25295 { 5695329Sgw25295 /* Free up the previous block */ 5705329Sgw25295 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 5715329Sgw25295 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 5725329Sgw25295 &zio->io_bp_orig, NULL, NULL)); 5735329Sgw25295 } 5745329Sgw25295 } 5755329Sgw25295 576789Sahrens static zio_t * 577789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 578789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 579789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 580789Sahrens { 581789Sahrens zio_t *zio; 582789Sahrens 583789Sahrens BP_ZERO(bp); 584789Sahrens BP_SET_LSIZE(bp, 
size); 585789Sahrens BP_SET_PSIZE(bp, size); 586789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 587789Sahrens 588789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 589789Sahrens ZIO_TYPE_WRITE, priority, flags, 590789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 591789Sahrens 592789Sahrens zio->io_checksum = checksum; 593789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 5945329Sgw25295 zio->io_ready = zio_write_allocate_ready; 595789Sahrens 596789Sahrens return (zio); 597789Sahrens } 598789Sahrens 599789Sahrens zio_t * 600789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 601789Sahrens zio_done_func_t *done, void *private) 602789Sahrens { 603789Sahrens zio_t *zio; 604789Sahrens 605789Sahrens ASSERT(!BP_IS_HOLE(bp)); 606789Sahrens 607789Sahrens if (txg == spa->spa_syncing_txg && 608789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 609789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 610789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 611789Sahrens } 612789Sahrens 613789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 6142981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 6155530Sbonwick ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); 616789Sahrens 617789Sahrens zio->io_bp = &zio->io_bp_copy; 618789Sahrens 619789Sahrens return (zio); 620789Sahrens } 621789Sahrens 622789Sahrens zio_t * 623789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 624789Sahrens zio_done_func_t *done, void *private) 625789Sahrens { 626789Sahrens zio_t *zio; 627789Sahrens 628789Sahrens /* 629789Sahrens * A claim is an allocation of a specific block. Claims are needed 630789Sahrens * to support immediate writes in the intent log. The issue is that 631789Sahrens * immediate writes contain committed data, but in a txg that was 632789Sahrens * *not* committed. 
Upon opening the pool after an unclean shutdown, 633789Sahrens * the intent log claims all blocks that contain immediate write data 634789Sahrens * so that the SPA knows they're in use. 635789Sahrens * 636789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 637789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 638789Sahrens */ 639789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 640789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 641789Sahrens 642789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 643789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 6445530Sbonwick ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); 645789Sahrens 646789Sahrens zio->io_bp = &zio->io_bp_copy; 647789Sahrens 648789Sahrens return (zio); 649789Sahrens } 650789Sahrens 651789Sahrens zio_t * 652789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 653789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 654789Sahrens { 655789Sahrens zio_t *zio; 656789Sahrens int c; 657789Sahrens 658789Sahrens if (vd->vdev_children == 0) { 659789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 660789Sahrens ZIO_TYPE_IOCTL, priority, flags, 661789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 662789Sahrens 663789Sahrens zio->io_vd = vd; 664789Sahrens zio->io_cmd = cmd; 665789Sahrens } else { 666789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 667789Sahrens 668789Sahrens for (c = 0; c < vd->vdev_children; c++) 669789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 670789Sahrens done, private, priority, flags)); 671789Sahrens } 672789Sahrens 673789Sahrens return (zio); 674789Sahrens } 675789Sahrens 676789Sahrens static void 677789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 6785450Sbrendan int checksum, boolean_t labels) 679789Sahrens { 680789Sahrens ASSERT(vd->vdev_children == 0); 681789Sahrens 682789Sahrens 
ASSERT(size <= SPA_MAXBLOCKSIZE); 683789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 684789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 685789Sahrens 6865450Sbrendan #ifdef ZFS_DEBUG 6875450Sbrendan if (labels) { 6885450Sbrendan ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 6895450Sbrendan offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 6905450Sbrendan } 6915450Sbrendan #endif 692789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 693789Sahrens 694789Sahrens BP_ZERO(bp); 695789Sahrens 696789Sahrens BP_SET_LSIZE(bp, size); 697789Sahrens BP_SET_PSIZE(bp, size); 698789Sahrens 699789Sahrens BP_SET_CHECKSUM(bp, checksum); 700789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 701789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 702789Sahrens 703789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 704789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 705789Sahrens } 706789Sahrens 707789Sahrens zio_t * 708789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 709789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 7105450Sbrendan int priority, int flags, boolean_t labels) 711789Sahrens { 712789Sahrens zio_t *zio; 713789Sahrens blkptr_t blk; 714789Sahrens 7155329Sgw25295 ZIO_ENTER(vd->vdev_spa); 7165329Sgw25295 7175450Sbrendan zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 718789Sahrens 719789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 720789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 721789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 722789Sahrens 723789Sahrens zio->io_vd = vd; 724789Sahrens zio->io_offset = offset; 725789Sahrens 726789Sahrens /* 727789Sahrens * Work off our copy of the bp so the caller can free it. 
728789Sahrens */ 729789Sahrens zio->io_bp = &zio->io_bp_copy; 730789Sahrens 731789Sahrens return (zio); 732789Sahrens } 733789Sahrens 734789Sahrens zio_t * 735789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 736789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 7375450Sbrendan int priority, int flags, boolean_t labels) 738789Sahrens { 739789Sahrens zio_block_tail_t *zbt; 740789Sahrens void *wbuf; 741789Sahrens zio_t *zio; 742789Sahrens blkptr_t blk; 743789Sahrens 7445329Sgw25295 ZIO_ENTER(vd->vdev_spa); 7455329Sgw25295 7465450Sbrendan zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 747789Sahrens 748789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 749789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 750789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 751789Sahrens 752789Sahrens zio->io_vd = vd; 753789Sahrens zio->io_offset = offset; 754789Sahrens 755789Sahrens zio->io_bp = &zio->io_bp_copy; 756789Sahrens zio->io_checksum = checksum; 757789Sahrens 758789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 759789Sahrens /* 760789Sahrens * zbt checksums are necessarily destructive -- they modify 761789Sahrens * one word of the write buffer to hold the verifier/checksum. 762789Sahrens * Therefore, we must make a local copy in case the data is 763789Sahrens * being written to multiple places. 764789Sahrens */ 765789Sahrens wbuf = zio_buf_alloc(size); 766789Sahrens bcopy(data, wbuf, size); 767789Sahrens zio_push_transform(zio, wbuf, size, size); 768789Sahrens 769789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 770789Sahrens zbt->zbt_cksum = blk.blk_cksum; 771789Sahrens } 772789Sahrens 773789Sahrens return (zio); 774789Sahrens } 775789Sahrens 776789Sahrens /* 777789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, int priority, int flags,
	zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	/*
	 * The child inherits the parent's spa and txg plus a filtered set
	 * of its flags, and begins execution at the vdev-I/O-start stage
	 * (io_stage is set one before ZIO_STAGE_VDEV_IO_START).
	 */
	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */

/*
 * Tear down a completed zio: destroy its lock and cv, release the
 * failed-vdev list (if one was accumulated), and return the zio to
 * the zio_cache kmem cache.
 */
static void
zio_destroy(zio_t *zio)
{
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	if (zio->io_failed_vds != NULL) {
		kmem_free(zio->io_failed_vds,
		    zio->io_failed_vds_count * sizeof (vdev_t *));
		zio->io_failed_vds = NULL;
		zio->io_failed_vds_count = 0;
	}
	kmem_cache_free(zio_cache, zio);
}

/*
 * Execute a zio synchronously: run its pipeline, sleep until zio_done()
 * marks it stalled at ZIO_STAGE_DONE and broadcasts the cv, then destroy
 * the zio and return its error.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	/* Record the waiter so zio_done() signals us instead of freeing. */
	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

/*
 * Execute a zio asynchronously; completion is reported via the zio's
 * done callback and parent notification, not to the caller.
 */
void
zio_nowait(zio_t *zio)
{
	zio_execute(zio);
}

/*
 * Continue pipeline execution of a zio from interrupt context by
 * handing it to the spa's per-I/O-type interrupt taskq.
 */
void
zio_interrupt(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

/*
 * Pipeline stage: reissue this zio on the spa's per-I/O-type issue
 * taskq and stop the current (synchronous) pipeline invocation.
 */
static int
zio_issue_async(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */

/*
 * If this zio still has outstanding children (*countp != 0), record the
 * stage it is stalled at and stop the pipeline; zio_notify_parent()
 * restarts it when the last child checks in at that stage.
 */
static int
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	int rv = ZIO_PIPELINE_CONTINUE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stalled == 0);
	if (*countp != 0) {
		zio->io_stalled = stage;
		rv = ZIO_PIPELINE_STOP;
	}
	mutex_exit(&zio->io_lock);

	return (rv);
}
/*
 * Record zio->io_vd in pio's list of vdevs that have returned an error.
 * Duplicates are suppressed; the list is reallocated one slot at a time.
 * Caller must hold pio->io_lock.
 */
static void
zio_add_failed_vdev(zio_t *pio, zio_t *zio)
{
	uint64_t oldcount = pio->io_failed_vds_count;
	vdev_t **new_vds;
	int i;

	ASSERT(MUTEX_HELD(&pio->io_lock));

	if (zio->io_vd == NULL)
		return;

	/* Already recorded this vdev? */
	for (i = 0; i < oldcount; i++) {
		if (pio->io_failed_vds[i] == zio->io_vd)
			return;
	}

	new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP);
	if (pio->io_failed_vds != NULL) {
		bcopy(pio->io_failed_vds, new_vds,
		    oldcount * sizeof (vdev_t *));
		kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *));
	}
	pio->io_failed_vds = new_vds;
	pio->io_failed_vds[oldcount] = zio->io_vd;
	pio->io_failed_vds_count++;
}

/*
 * A child zio has reached 'stage': propagate its error to the parent
 * (unless ZIO_FLAG_DONT_PROPAGATE), decrement the parent's outstanding-
 * children counter, and if the parent was stalled waiting at exactly
 * this stage, restart its pipeline.
 */
static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) {
		pio->io_error = zio->io_error;
		/* ENOTSUP is not treated as a device failure. */
		if (zio->io_error && zio->io_error != ENOTSUP)
			zio_add_failed_vdev(pio, zio);
	}
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		/* Drop the lock before re-entering the parent's pipeline. */
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

/* Stall this zio until all of its children have reached the READY stage. */
int
zio_wait_for_children_ready(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
	    &zio->io_children_notready));
}

/* Stall this zio until all of its children have reached the DONE stage. */
int
zio_wait_for_children_done(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
	    &zio->io_children_notdone));
}

/*
 * Pipeline stage: set up the transforms a read will need on the way back
 * up -- decompression and/or gang reassembly -- based on the bp, and set
 * ZIO_FLAG_DONT_CACHE for level-0 non-metadata blocks.
 */
static int
zio_read_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		/* Read into a scratch buffer; decompress stage pops it. */
		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (BP_IS_GANG(bp)) {
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: this zio is ready.  Invoke the io_ready callback,
 * notify the parent, and snapshot the bp into io_bp_copy so that
 * zio_assess() can later verify it was not modified.
 */
static int
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (zio->io_ready)
		zio->io_ready(zio);

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	return (ZIO_PIPELINE_CONTINUE);
}
/*
 * Reset a failed allocating write so it can be reissued from scratch:
 * preserve and then zero the bp's DVAs, rewind the pipeline via
 * zio_reset(), and bump the parent's not-ready count so it waits for
 * the re-allocated child.  Always continues the pipeline.
 */
static int
zio_vdev_retry_io(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	/*
	 * Preserve the failed bp so that the io_ready() callback can
	 * update the accounting accordingly. The callback will also be
	 * responsible for freeing the previously allocated block, if one
	 * exists.
	 */
	zio->io_bp_orig = *zio->io_bp;

	/*
	 * We must zero out the old DVA and blk_birth before reallocating
	 * the bp.
	 */
	BP_ZERO_DVAS(zio->io_bp);
	zio_reset(zio);

	if (pio) {
		/*
		 * Let the parent know that we will
		 * re-alloc the write (=> new bp info).
		 */
		mutex_enter(&pio->io_lock);
		pio->io_children_notready++;

		/*
		 * If the parent I/O is still in the open stage, then
		 * don't bother telling it to retry since it hasn't
		 * progressed far enough for it to care.
		 */
		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;

		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
		mutex_exit(&pio->io_lock);
	}

	/*
	 * We are getting ready to process the retry request so clear
	 * the flag and the zio's current error status.
	 */
	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
	zio->io_error = 0;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Resume I/O on a pool suspended by zio_vdev_suspend_io().  First probe
 * every vdev implicated by a queued must-succeed zio; if any probe fails
 * the pool stays suspended and the probe error is returned.  Otherwise
 * clear vdev error state, reissue every queued zio on zio_taskq, and
 * wait for them.  Returns 0 on success or EIO if the pool failed again.
 */
int
zio_vdev_resume_io(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_zio_lock);

	/*
	 * Probe all of vdevs that have experienced an I/O error.
	 * If we are still unable to verify the integrity of the vdev
	 * then we prevent the resume from proceeding.
	 */
	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
	    zio = list_next(&spa->spa_zio_list, zio)) {
		int error = 0;

		/* We only care about I/Os that must succeed */
		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
			continue;
		error = vdev_probe(zio->io_vd);
		if (error) {
			mutex_exit(&spa->spa_zio_lock);
			return (error);
		}
	}

	/*
	 * Clear the vdev stats so that I/O can flow.
	 */
	vdev_clear(spa, NULL, B_FALSE);

	spa->spa_state = POOL_STATE_ACTIVE;
	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
		list_remove(&spa->spa_zio_list, zio);
		zio->io_error = 0;

		/*
		 * If we are resuming an allocating I/O then we force it
		 * to retry and let it resume operation where it left off.
		 * Otherwise, go back to the ready stage and pick up from
		 * there.
		 */
		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
			zio->io_stage--;
		} else {
			zio->io_stage = ZIO_STAGE_READY;
		}

		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
		    zio, TQ_SLEEP);
	}
	mutex_exit(&spa->spa_zio_lock);

	/*
	 * Wait for the taskqs to finish and recheck the pool state since
	 * it's possible that a resumed I/O has failed again.
	 */
	taskq_wait(zio_taskq);
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	mutex_enter(&spa->spa_zio_lock);
	cv_broadcast(&spa->spa_zio_cv);
	mutex_exit(&spa->spa_zio_lock);

	return (0);
}

/*
 * Pipeline stage: the pool has suffered an unrecoverable failure.
 * Mark the pool POOL_STATE_IO_FAILURE, queue this zio on the spa's
 * suspended-I/O list for zio_vdev_resume_io(), and stop the pipeline.
 */
static int
zio_vdev_suspend_io(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	/*
	 * We've experienced an unrecoverable failure so
	 * set the pool state accordingly and queue all
	 * failed IOs.
	 */
	spa->spa_state = POOL_STATE_IO_FAILURE;

	mutex_enter(&spa->spa_zio_lock);
	list_insert_tail(&spa->spa_zio_list, zio);

#ifndef _KERNEL
	/* Used to notify ztest that the pool has suspended */
	cv_broadcast(&spa->spa_zio_cv);
#endif
	mutex_exit(&spa->spa_zio_lock);

	return (ZIO_PIPELINE_STOP);
}
11295329Sgw25295 */ 11305329Sgw25295 spa->spa_state = POOL_STATE_IO_FAILURE; 11315329Sgw25295 11325329Sgw25295 mutex_enter(&spa->spa_zio_lock); 11335329Sgw25295 list_insert_tail(&spa->spa_zio_list, zio); 11345329Sgw25295 11355329Sgw25295 #ifndef _KERNEL 11365329Sgw25295 /* Used to notify ztest that the pool has suspended */ 11375329Sgw25295 cv_broadcast(&spa->spa_zio_cv); 11385329Sgw25295 #endif 11395329Sgw25295 mutex_exit(&spa->spa_zio_lock); 11405530Sbonwick 11415530Sbonwick return (ZIO_PIPELINE_STOP); 11425329Sgw25295 } 11435329Sgw25295 11446523Sek110237 static void 11456523Sek110237 zio_handle_io_failure(zio_t *zio, vdev_t *vd) 11466523Sek110237 { 11476523Sek110237 spa_t *spa = zio->io_spa; 11486523Sek110237 blkptr_t *bp = zio->io_bp; 11496523Sek110237 char *blkbuf; 11506523Sek110237 11516523Sek110237 #ifdef ZFS_DEBUG 11526523Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 11536523Sek110237 if (blkbuf) { 11546523Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 11556523Sek110237 bp ? bp : &zio->io_bp_copy); 11566523Sek110237 } 11576523Sek110237 cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d", 11586523Sek110237 zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", 11596523Sek110237 zio_type_name[zio->io_type], vdev_description(vd), 11606523Sek110237 (u_longlong_t)zio->io_offset, (void *)zio, 11616523Sek110237 blkbuf ? blkbuf : "", zio->io_error); 11626523Sek110237 if (blkbuf) 11636523Sek110237 kmem_free(blkbuf, BP_SPRINTF_LEN); 11646523Sek110237 #endif 11656523Sek110237 11666523Sek110237 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { 11676523Sek110237 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 11686523Sek110237 "failure and the failure mode property for this pool " 11696523Sek110237 "is set to panic.", spa_name(spa)); 11706523Sek110237 } 11716523Sek110237 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 11726523Sek110237 vdev_set_state(vd, vd == spa->spa_root_vdev ? 
B_TRUE : B_FALSE, 11736523Sek110237 VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE); 11746523Sek110237 } 11756523Sek110237 11765530Sbonwick static int 11775329Sgw25295 zio_assess(zio_t *zio) 11785329Sgw25295 { 1179789Sahrens spa_t *spa = zio->io_spa; 1180789Sahrens blkptr_t *bp = zio->io_bp; 1181789Sahrens vdev_t *vd = zio->io_vd; 1182789Sahrens 1183789Sahrens ASSERT(zio->io_children_notready == 0); 1184789Sahrens ASSERT(zio->io_children_notdone == 0); 1185789Sahrens 1186789Sahrens if (bp != NULL) { 1187789Sahrens ASSERT(bp->blk_pad[0] == 0); 1188789Sahrens ASSERT(bp->blk_pad[1] == 0); 1189789Sahrens ASSERT(bp->blk_pad[2] == 0); 1190789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 1191789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 11921775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 1193789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 11941775Sbillm if (zio->io_ndvas != 0) 11951775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 11961775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 11971775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 11981775Sbillm } 1199789Sahrens } 1200789Sahrens 12015329Sgw25295 /* 12025329Sgw25295 * Some child I/O has indicated that a retry is necessary, so 12035329Sgw25295 * we set an error on the I/O and let the logic below do the 12045329Sgw25295 * rest. 12055329Sgw25295 */ 12065329Sgw25295 if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) 12075329Sgw25295 zio->io_error = ERESTART; 12085329Sgw25295 1209789Sahrens if (vd != NULL) 1210789Sahrens vdev_stat_update(zio); 1211789Sahrens 1212789Sahrens if (zio->io_error) { 12131544Seschrock /* 12141544Seschrock * If this I/O is attached to a particular vdev, 12151544Seschrock * generate an error message describing the I/O failure 12161544Seschrock * at the block level. We ignore these errors if the 12171544Seschrock * device is currently unavailable. 
12181544Seschrock */ 12191732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 12205329Sgw25295 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 1221789Sahrens 12221544Seschrock if ((zio->io_error == EIO || 12231544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 12241544Seschrock zio->io_logical == zio) { 12251544Seschrock /* 12261544Seschrock * For root I/O requests, tell the SPA to log the error 12271544Seschrock * appropriately. Also, generate a logical data 12281544Seschrock * ereport. 12291544Seschrock */ 12305329Sgw25295 spa_log_error(spa, zio); 12311544Seschrock 12325329Sgw25295 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 12335329Sgw25295 0, 0); 12341544Seschrock } 1235789Sahrens 12361544Seschrock /* 12375403Sgw25295 * If we are an allocating I/O then we attempt to reissue 12385403Sgw25295 * the I/O on another vdev unless the pool is out of space. 12395403Sgw25295 * We handle this condition based on the spa's failmode 12405403Sgw25295 * property. 12415329Sgw25295 */ 12425329Sgw25295 if (zio_write_retry && zio->io_error != ENOSPC && 12435530Sbonwick IO_IS_ALLOCATING(zio)) 12445530Sbonwick return (zio_vdev_retry_io(zio)); 12455530Sbonwick 12465329Sgw25295 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 12475329Sgw25295 12485329Sgw25295 /* 12495329Sgw25295 * For I/O requests that cannot fail, we carry out 12505329Sgw25295 * the requested behavior based on the failmode pool 12515329Sgw25295 * property. 12525329Sgw25295 * 12535329Sgw25295 * XXX - Need to differentiate between an ENOSPC as 12545329Sgw25295 * a result of vdev failures vs. a full pool. 
12551544Seschrock */ 12561544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 12576523Sek110237 int i; 12583459Sek110237 12596523Sek110237 for (i = 0; i < zio->io_failed_vds_count; i++) { 12606523Sek110237 zio_handle_io_failure(zio, 12616523Sek110237 zio->io_failed_vds[i]); 12623459Sek110237 } 12636523Sek110237 if (zio->io_failed_vds_count == 0) { 12646523Sek110237 zio_handle_io_failure(zio, 12656523Sek110237 vd ? vd : spa->spa_root_vdev); 12665329Sgw25295 } 12676523Sek110237 if (zio->io_failed_vds != NULL) { 12686523Sek110237 kmem_free(zio->io_failed_vds, 12696523Sek110237 zio->io_failed_vds_count * 12706523Sek110237 sizeof (vdev_t *)); 12716523Sek110237 zio->io_failed_vds = NULL; 12726523Sek110237 zio->io_failed_vds_count = 0; 12736523Sek110237 } 12745530Sbonwick return (zio_vdev_suspend_io(zio)); 12751544Seschrock } 1276789Sahrens } 12775329Sgw25295 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 12785329Sgw25295 ASSERT(zio->io_children_notready == 0); 12795530Sbonwick 12805530Sbonwick return (ZIO_PIPELINE_CONTINUE); 12815329Sgw25295 } 12825329Sgw25295 12835530Sbonwick static int 12845329Sgw25295 zio_done(zio_t *zio) 12855329Sgw25295 { 12865329Sgw25295 zio_t *pio = zio->io_parent; 12875329Sgw25295 spa_t *spa = zio->io_spa; 12885329Sgw25295 12895329Sgw25295 ASSERT(zio->io_children_notready == 0); 12905329Sgw25295 ASSERT(zio->io_children_notdone == 0); 12915329Sgw25295 1292789Sahrens zio_clear_transform_stack(zio); 1293789Sahrens 1294789Sahrens if (zio->io_done) 1295789Sahrens zio->io_done(zio); 1296789Sahrens 1297789Sahrens ASSERT(zio->io_delegate_list == NULL); 1298789Sahrens ASSERT(zio->io_delegate_next == NULL); 1299789Sahrens 1300789Sahrens if (pio != NULL) { 1301789Sahrens zio_t *next, *prev; 1302789Sahrens 1303789Sahrens mutex_enter(&pio->io_lock); 1304789Sahrens next = zio->io_sibling_next; 1305789Sahrens prev = zio->io_sibling_prev; 1306789Sahrens if (next != NULL) 1307789Sahrens next->io_sibling_prev = prev; 1308789Sahrens if (prev != NULL) 
/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */

/*
 * Pipeline stage: compress the write payload if requested, then fill in
 * the bp's size/compression fields and choose the remainder of the write
 * pipeline (rewrite in place, hole, or new allocation).
 */
static int
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	/* If compression was unsuccessful, fall back to an uncompressed write. */
	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_COMPRESS(bp, compress);
		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
	} else {
		if (bp->blk_birth == zio->io_txg)
			BP_ZERO(bp);
		if (csize == 0) {
			/* Nothing to write: leave the bp a hole. */
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: pop the compressed buffer pushed by zio_read_init()
 * and decompress it into the caller's buffer.  Sets io_error = EIO on
 * decompression failure.
 */
static int
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */

/*
 * Byteswap a just-read gang header in place if the bp indicates it was
 * written with the opposite endianness.
 */
static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}
/*
 * Pipeline stage: issue a child read of the gang header into a freshly
 * pushed transform buffer, then stall until the child completes.
 */
static int
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(BP_IS_GANG(bp));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));

	return (zio_wait_for_children_done(zio));
}

/*
 * Pipeline stage: walk the gang header fetched by zio_get_gang_header()
 * and issue a child read per member, reassembling the members back to
 * back into io_data.
 */
static int
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority,
		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);

	return (zio_wait_for_children_done(zio));
}

/*
 * Pipeline stage: rewrite each non-hole member of a gang block in place,
 * then push the gang header back on the transform stack so the header
 * itself gets rewritten too.  Stalls until the children are ready.
 */
static int
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
		    &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);

	return (zio_wait_for_children_ready(zio));
}

/*
 * Pipeline stage: issue a child free for every non-hole member of a
 * gang block.
 */
static int
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Pipeline stage: issue a child claim for every non-hole member of a
 * gang block.
 */
static int
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Done callback for gang-member writes: fold the member's allocated
 * size into the corresponding DVA ASIZE of the parent's gang bp, under
 * the parent's io_lock (members complete concurrently).
 */
static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;
	int d;

	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
/*
 * Split a write that could not be allocated whole into a gang block:
 * allocate a gang header, carve the payload into progressively smaller
 * members (halving maxalloc on each ENOSPC) and issue a rewrite or a
 * recursive allocating write for each, then wait for all children.
 * Sets io_error and continues the pipeline if even the header or a
 * minimum-size member cannot be allocated.
 */
static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	/* Allocate the gang header itself first. */
	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * Keep halving the member size until the allocation
		 * succeeds or we hit the minimum block size.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/* The allocation above succeeded: rewrite member. */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa, zio->io_checksum, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			/* Recurse: this member may itself become a gang. */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa->spa_normal_class;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_ndvas, >, 0);
	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));

	/*
	 * For testing purposes, we force I/Os to retry. We don't allow
	 * retries beyond the first pass since those I/Os are non-allocating
	 * writes.
17285329Sgw25295 */ 17295329Sgw25295 if (zio_io_fail_shift && 17305329Sgw25295 spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && 17315329Sgw25295 zio_io_should_fail(zio_io_fail_shift)) 17325329Sgw25295 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 17335329Sgw25295 1734789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1735789Sahrens 17364527Sperrin error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, 17373063Sperrin zio->io_txg, NULL, B_FALSE); 1738789Sahrens 1739789Sahrens if (error == 0) { 1740789Sahrens bp->blk_birth = zio->io_txg; 17415329Sgw25295 } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { 17425530Sbonwick return (zio_write_allocate_gang_members(zio, mc)); 1743789Sahrens } else { 1744789Sahrens zio->io_error = error; 1745789Sahrens } 17465530Sbonwick 17475530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1748789Sahrens } 1749789Sahrens 17505530Sbonwick static int 1751789Sahrens zio_dva_free(zio_t *zio) 1752789Sahrens { 1753789Sahrens blkptr_t *bp = zio->io_bp; 1754789Sahrens 17551807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1756789Sahrens 1757789Sahrens BP_ZERO(bp); 1758789Sahrens 17595530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1760789Sahrens } 1761789Sahrens 17625530Sbonwick static int 1763789Sahrens zio_dva_claim(zio_t *zio) 1764789Sahrens { 17651807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1766789Sahrens 17675530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1768789Sahrens } 1769789Sahrens 1770789Sahrens /* 1771789Sahrens * ========================================================================== 1772789Sahrens * Read and write to physical devices 1773789Sahrens * ========================================================================== 1774789Sahrens */ 1775789Sahrens 17765530Sbonwick static int 17771775Sbillm zio_vdev_io_start(zio_t *zio) 1778789Sahrens { 1779789Sahrens vdev_t *vd = zio->io_vd; 17801775Sbillm vdev_t *tvd = vd ? 
vd->vdev_top : NULL; 17811775Sbillm blkptr_t *bp = zio->io_bp; 17821775Sbillm uint64_t align; 17835329Sgw25295 spa_t *spa = zio->io_spa; 17845329Sgw25295 17855329Sgw25295 /* 17865329Sgw25295 * If the pool is already in a failure state then just suspend 17875329Sgw25295 * this IO until the problem is resolved. We will reissue them 17885329Sgw25295 * at that time. 17895329Sgw25295 */ 17905329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE && 17915530Sbonwick zio->io_type == ZIO_TYPE_WRITE) 17925530Sbonwick return (zio_vdev_suspend_io(zio)); 1793789Sahrens 17945530Sbonwick /* 17955530Sbonwick * The mirror_ops handle multiple DVAs in a single BP 17965530Sbonwick */ 17975530Sbonwick if (vd == NULL) 17985530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 17991775Sbillm 18001775Sbillm align = 1ULL << tvd->vdev_ashift; 18011775Sbillm 18021732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1803789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1804789Sahrens 18055530Sbonwick if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { 1806789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1807789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1808789Sahrens } 1809789Sahrens 18101732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 18111732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 18121732Sbonwick char *abuf = zio_buf_alloc(asize); 18131732Sbonwick ASSERT(vd == tvd); 18141732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 18151732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 18161732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 18171732Sbonwick } 18181732Sbonwick zio_push_transform(zio, abuf, asize, asize); 18191732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 18201732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 18211732Sbonwick } 18221732Sbonwick 18231732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 18241732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 18251732Sbonwick ASSERT(bp == NULL || 
18261732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1827789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1828789Sahrens 18295530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 1830789Sahrens } 1831789Sahrens 18325530Sbonwick static int 1833789Sahrens zio_vdev_io_done(zio_t *zio) 1834789Sahrens { 18351775Sbillm if (zio->io_vd == NULL) 18365530Sbonwick return (vdev_mirror_ops.vdev_op_io_done(zio)); 18375530Sbonwick 18385530Sbonwick return (zio->io_vd->vdev_ops->vdev_op_io_done(zio)); 1839789Sahrens } 1840789Sahrens 1841789Sahrens /* XXPOLICY */ 18421544Seschrock boolean_t 1843789Sahrens zio_should_retry(zio_t *zio) 1844789Sahrens { 1845789Sahrens vdev_t *vd = zio->io_vd; 1846789Sahrens 1847789Sahrens if (zio->io_error == 0) 1848789Sahrens return (B_FALSE); 1849789Sahrens if (zio->io_delegate_list != NULL) 1850789Sahrens return (B_FALSE); 18516976Seschrock if (vd != NULL) { 18526976Seschrock if (vd != vd->vdev_top) 18536976Seschrock return (B_FALSE); 18546976Seschrock if (vd->vdev_is_failing) 18556976Seschrock return (B_FALSE); 18566976Seschrock } 1857789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1858789Sahrens return (B_FALSE); 18591544Seschrock if (zio->io_retries > 0) 1860789Sahrens return (B_FALSE); 1861789Sahrens 1862789Sahrens return (B_TRUE); 1863789Sahrens } 1864789Sahrens 18655530Sbonwick static int 1866789Sahrens zio_vdev_io_assess(zio_t *zio) 1867789Sahrens { 1868789Sahrens vdev_t *vd = zio->io_vd; 18691775Sbillm vdev_t *tvd = vd ? 
vd->vdev_top : NULL; 1870789Sahrens 18711544Seschrock ASSERT(zio->io_vsd == NULL); 1872789Sahrens 18731732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 18741732Sbonwick void *abuf; 18751732Sbonwick uint64_t asize; 18761732Sbonwick ASSERT(vd == tvd); 18771732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 18781732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 18791732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 18801732Sbonwick zio_buf_free(abuf, asize); 18811732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 18821732Sbonwick } 18831732Sbonwick 18841544Seschrock if (zio_injection_enabled && !zio->io_error) 18851544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1886789Sahrens 1887789Sahrens /* 1888789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1889789Sahrens */ 1890789Sahrens /* XXPOLICY */ 1891789Sahrens if (zio_should_retry(zio)) { 1892789Sahrens ASSERT(tvd == vd); 1893789Sahrens 1894789Sahrens zio->io_retries++; 1895789Sahrens zio->io_error = 0; 18965688Sbonwick zio->io_flags &= ZIO_FLAG_RETRY_INHERIT; 1897789Sahrens /* XXPOLICY */ 1898789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1899789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 19001775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1901789Sahrens 19025530Sbonwick return (ZIO_PIPELINE_CONTINUE); 19031544Seschrock } 1904789Sahrens 19055530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1906789Sahrens } 1907789Sahrens 1908789Sahrens void 1909789Sahrens zio_vdev_io_reissue(zio_t *zio) 1910789Sahrens { 1911789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1912789Sahrens ASSERT(zio->io_error == 0); 1913789Sahrens 1914789Sahrens zio->io_stage--; 1915789Sahrens } 1916789Sahrens 1917789Sahrens void 1918789Sahrens zio_vdev_io_redone(zio_t *zio) 1919789Sahrens { 1920789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1921789Sahrens 1922789Sahrens zio->io_stage--; 1923789Sahrens } 1924789Sahrens 1925789Sahrens void 1926789Sahrens 
zio_vdev_io_bypass(zio_t *zio) 1927789Sahrens { 1928789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1929789Sahrens ASSERT(zio->io_error == 0); 1930789Sahrens 1931789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1932789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1933789Sahrens } 1934789Sahrens 1935789Sahrens /* 1936789Sahrens * ========================================================================== 1937789Sahrens * Generate and verify checksums 1938789Sahrens * ========================================================================== 1939789Sahrens */ 19405530Sbonwick static int 1941789Sahrens zio_checksum_generate(zio_t *zio) 1942789Sahrens { 1943789Sahrens int checksum = zio->io_checksum; 1944789Sahrens blkptr_t *bp = zio->io_bp; 1945789Sahrens 1946789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1947789Sahrens 1948789Sahrens BP_SET_CHECKSUM(bp, checksum); 1949789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1950789Sahrens 1951789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1952789Sahrens 19535530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1954789Sahrens } 1955789Sahrens 19565530Sbonwick static int 1957789Sahrens zio_gang_checksum_generate(zio_t *zio) 1958789Sahrens { 1959789Sahrens zio_cksum_t zc; 1960789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1961789Sahrens 19621775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1963789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1964789Sahrens 1965789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1966789Sahrens 1967789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1968789Sahrens 19695530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1970789Sahrens } 1971789Sahrens 19725530Sbonwick static int 1973789Sahrens zio_checksum_verify(zio_t *zio) 1974789Sahrens { 1975789Sahrens if (zio->io_bp != NULL) { 1976789Sahrens zio->io_error = zio_checksum_error(zio); 19771544Seschrock if (zio->io_error && !(zio->io_flags & 
ZIO_FLAG_SPECULATIVE)) 19781544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 19791544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1980789Sahrens } 1981789Sahrens 19825530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1983789Sahrens } 1984789Sahrens 1985789Sahrens /* 1986789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1987789Sahrens */ 1988789Sahrens void 1989789Sahrens zio_checksum_verified(zio_t *zio) 1990789Sahrens { 1991789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1992789Sahrens } 1993789Sahrens 1994789Sahrens /* 1995789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1996789Sahrens */ 1997789Sahrens void 1998789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1999789Sahrens { 20001775Sbillm blkptr_t *bp = zio->io_bp; 20011775Sbillm 20021775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 20031775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 20041775Sbillm zcp->zc_word[2] = bp->blk_birth; 2005789Sahrens zcp->zc_word[3] = 0; 2006789Sahrens } 2007789Sahrens 2008789Sahrens /* 2009789Sahrens * ========================================================================== 2010789Sahrens * Define the pipeline 2011789Sahrens * ========================================================================== 2012789Sahrens */ 20135530Sbonwick typedef int zio_pipe_stage_t(zio_t *zio); 2014789Sahrens 2015789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 20165530Sbonwick NULL, 20175530Sbonwick zio_wait_for_children_ready, 20185530Sbonwick zio_read_init, 20195530Sbonwick zio_issue_async, 2020789Sahrens zio_write_compress, 2021789Sahrens zio_checksum_generate, 2022789Sahrens zio_get_gang_header, 2023789Sahrens zio_rewrite_gang_members, 2024789Sahrens zio_free_gang_members, 2025789Sahrens zio_claim_gang_members, 2026789Sahrens zio_dva_allocate, 2027789Sahrens zio_dva_free, 2028789Sahrens zio_dva_claim, 2029789Sahrens zio_gang_checksum_generate, 2030789Sahrens 
zio_ready, 2031789Sahrens zio_vdev_io_start, 2032789Sahrens zio_vdev_io_done, 2033789Sahrens zio_vdev_io_assess, 20345530Sbonwick zio_wait_for_children_done, 2035789Sahrens zio_checksum_verify, 2036789Sahrens zio_read_gang_members, 2037789Sahrens zio_read_decompress, 20385329Sgw25295 zio_assess, 2039789Sahrens zio_done, 20405530Sbonwick NULL 2041789Sahrens }; 2042789Sahrens 2043789Sahrens /* 20445530Sbonwick * Execute the I/O pipeline until one of the following occurs: 20455530Sbonwick * (1) the I/O completes; (2) the pipeline stalls waiting for 20465530Sbonwick * dependent child I/Os; (3) the I/O issues, so we're waiting 20475530Sbonwick * for an I/O completion interrupt; (4) the I/O is delegated by 20485530Sbonwick * vdev-level caching or aggregation; (5) the I/O is deferred 20495530Sbonwick * due to vdev-level queueing; (6) the I/O is handed off to 20505530Sbonwick * another thread. In all cases, the pipeline stops whenever 20515530Sbonwick * there's no CPU work; it never burns a thread in cv_wait(). 20525530Sbonwick * 20535530Sbonwick * There's no locking on io_stage because there's no legitimate way 20545530Sbonwick * for multiple threads to be attempting to process the same I/O. 2055789Sahrens */ 2056789Sahrens void 20575530Sbonwick zio_execute(zio_t *zio) 2058789Sahrens { 20595530Sbonwick while (zio->io_stage < ZIO_STAGE_DONE) { 20605530Sbonwick uint32_t pipeline = zio->io_pipeline; 20615530Sbonwick int rv; 2062789Sahrens 20635530Sbonwick ASSERT(!MUTEX_HELD(&zio->io_lock)); 2064789Sahrens 20655530Sbonwick /* 20665530Sbonwick * If an error occurred outside the vdev stack, 20675530Sbonwick * just execute the interlock stages to clean up. 
20685530Sbonwick */ 20695530Sbonwick if (zio->io_error && 20705530Sbonwick ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0) 2071789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 2072789Sahrens 20735530Sbonwick while (((1U << ++zio->io_stage) & pipeline) == 0) 20745530Sbonwick continue; 2075789Sahrens 20765530Sbonwick ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 20775530Sbonwick ASSERT(zio->io_stalled == 0); 20785530Sbonwick 20795530Sbonwick rv = zio_pipeline[zio->io_stage](zio); 20805530Sbonwick 20815530Sbonwick if (rv == ZIO_PIPELINE_STOP) 20825530Sbonwick return; 20835530Sbonwick 20845530Sbonwick ASSERT(rv == ZIO_PIPELINE_CONTINUE); 2085789Sahrens } 2086789Sahrens } 2087789Sahrens 20883668Sgw25295 static boolean_t 20895329Sgw25295 zio_io_should_fail(uint16_t range) 20903668Sgw25295 { 20913668Sgw25295 static uint16_t allocs = 0; 20923668Sgw25295 20935329Sgw25295 return (P2PHASE(allocs++, 1U<<range) == 0); 20943668Sgw25295 } 20953668Sgw25295 2096789Sahrens /* 2097789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 2098789Sahrens */ 2099789Sahrens int 21003063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 21013063Sperrin uint64_t txg) 2102789Sahrens { 2103789Sahrens int error; 2104789Sahrens 21051544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2106789Sahrens 21075329Sgw25295 if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) { 21083668Sgw25295 spa_config_exit(spa, FTAG); 21093668Sgw25295 return (ENOSPC); 21103668Sgw25295 } 21113668Sgw25295 21123063Sperrin /* 21134527Sperrin * We were passed the previous log block's DVA in bp->blk_dva[0]. 21144527Sperrin * We use that as a hint for which vdev to allocate from next. 
21153063Sperrin */ 21164527Sperrin error = metaslab_alloc(spa, spa->spa_log_class, size, 21174527Sperrin new_bp, 1, txg, old_bp, B_TRUE); 21184527Sperrin 21194527Sperrin if (error) 21204527Sperrin error = metaslab_alloc(spa, spa->spa_normal_class, size, 21214527Sperrin new_bp, 1, txg, old_bp, B_TRUE); 2122789Sahrens 2123789Sahrens if (error == 0) { 21243063Sperrin BP_SET_LSIZE(new_bp, size); 21253063Sperrin BP_SET_PSIZE(new_bp, size); 21263063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 21273063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 21283063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 21293063Sperrin BP_SET_LEVEL(new_bp, 0); 21303063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 21313063Sperrin new_bp->blk_birth = txg; 2132789Sahrens } 2133789Sahrens 21341544Seschrock spa_config_exit(spa, FTAG); 2135789Sahrens 2136789Sahrens return (error); 2137789Sahrens } 2138789Sahrens 2139789Sahrens /* 2140789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 2141789Sahrens * nothing to do except metaslab_free() it. 2142789Sahrens */ 2143789Sahrens void 2144789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 2145789Sahrens { 21461775Sbillm ASSERT(!BP_IS_GANG(bp)); 2147789Sahrens 21481544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2149789Sahrens 21501807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 2151789Sahrens 21521544Seschrock spa_config_exit(spa, FTAG); 2153789Sahrens } 21544469Sperrin 21554469Sperrin /* 21564469Sperrin * start an async flush of the write cache for this vdev 21574469Sperrin */ 21584469Sperrin void 21595688Sbonwick zio_flush(zio_t *zio, vdev_t *vd) 21604469Sperrin { 21615688Sbonwick zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 21624469Sperrin NULL, NULL, ZIO_PRIORITY_NOW, 21634469Sperrin ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); 21644469Sperrin } 2165