1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 22*6245Smaybee * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 291544Seschrock #include <sys/fm/fs/zfs.h> 30789Sahrens #include <sys/spa.h> 31789Sahrens #include <sys/txg.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/vdev_impl.h> 34789Sahrens #include <sys/zio_impl.h> 35789Sahrens #include <sys/zio_compress.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens 38789Sahrens /* 39789Sahrens * ========================================================================== 40789Sahrens * I/O priority table 41789Sahrens * ========================================================================== 42789Sahrens */ 43789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44789Sahrens 0, /* ZIO_PRIORITY_NOW */ 45789Sahrens 0, /* ZIO_PRIORITY_SYNC_READ */ 46789Sahrens 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47789Sahrens 6, /* ZIO_PRIORITY_ASYNC_READ */ 48789Sahrens 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49789Sahrens 4, /* ZIO_PRIORITY_FREE */ 50789Sahrens 0, /* ZIO_PRIORITY_CACHE_FILL */ 51789Sahrens 0, /* ZIO_PRIORITY_LOG_WRITE */ 52789Sahrens 10, /* ZIO_PRIORITY_RESILVER */ 53789Sahrens 20, /* ZIO_PRIORITY_SCRUB */ 54789Sahrens }; 55789Sahrens 56789Sahrens /* 57789Sahrens * ========================================================================== 58789Sahrens * I/O type descriptions 59789Sahrens * ========================================================================== 60789Sahrens */ 61789Sahrens char *zio_type_name[ZIO_TYPES] = { 62789Sahrens "null", "read", "write", "free", "claim", "ioctl" }; 63789Sahrens 643668Sgw25295 /* Force an allocation failure when non-zero */ 653668Sgw25295 uint16_t zio_zil_fail_shift = 0; 665329Sgw25295 uint16_t zio_io_fail_shift = 0; 675329Sgw25295 685329Sgw25295 /* Enable/disable the write-retry logic */ 695329Sgw25295 int zio_write_retry = 1; 705329Sgw25295 715329Sgw25295 /* Taskq to handle reissuing of I/Os */ 725329Sgw25295 taskq_t *zio_taskq; 735329Sgw25295 int zio_resume_threads = 4; 743668Sgw25295 75789Sahrens typedef struct zio_sync_pass { 76789Sahrens int zp_defer_free; /* defer frees after this pass */ 77789Sahrens int zp_dontcompress; /* don't compress after this pass */ 78789Sahrens int zp_rewrite; /* rewrite new bps after this pass */ 79789Sahrens } zio_sync_pass_t; 80789Sahrens 81789Sahrens zio_sync_pass_t zio_sync_pass = { 82789Sahrens 1, /* zp_defer_free */ 83789Sahrens 4, /* zp_dontcompress */ 84789Sahrens 1, /* zp_rewrite */ 85789Sahrens }; 86789Sahrens 875329Sgw25295 static boolean_t zio_io_should_fail(uint16_t); 885329Sgw25295 89789Sahrens /* 90789Sahrens * ========================================================================== 91789Sahrens * I/O kmem caches 92789Sahrens * ========================================================================== 93789Sahrens */ 944055Seschrock kmem_cache_t *zio_cache; 95789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 963290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 973290Sjohansen 983290Sjohansen #ifdef _KERNEL 993290Sjohansen extern vmem_t *zio_alloc_arena; 1003290Sjohansen #endif 101789Sahrens 1025329Sgw25295 /* 1035329Sgw25295 * Determine if we are allowed to issue the IO based on the 1045329Sgw25295 * pool state. If we must wait then block until we are told 1055329Sgw25295 * that we may continue. 1065329Sgw25295 */ 1075329Sgw25295 #define ZIO_ENTER(spa) { \ 1085329Sgw25295 if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ 1095329Sgw25295 mutex_enter(&spa->spa_zio_lock); \ 1105329Sgw25295 while (spa->spa_state == POOL_STATE_IO_FAILURE) \ 1115329Sgw25295 cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ 1125329Sgw25295 mutex_exit(&spa->spa_zio_lock); \ 1135329Sgw25295 } \ 1145329Sgw25295 } 1155329Sgw25295 1165329Sgw25295 /* 1175329Sgw25295 * An allocation zio is one that either currently has the DVA allocate 1185329Sgw25295 * stage set or will have it later in it's lifetime. 1195329Sgw25295 */ 1205329Sgw25295 #define IO_IS_ALLOCATING(zio) \ 1215688Sbonwick ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 1225329Sgw25295 123789Sahrens void 124789Sahrens zio_init(void) 125789Sahrens { 126789Sahrens size_t c; 1273290Sjohansen vmem_t *data_alloc_arena = NULL; 1283290Sjohansen 1293290Sjohansen #ifdef _KERNEL 1303290Sjohansen data_alloc_arena = zio_alloc_arena; 1313290Sjohansen #endif 132789Sahrens 1334055Seschrock zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 1344055Seschrock NULL, NULL, NULL, NULL, NULL, 0); 1354055Seschrock 136789Sahrens /* 137789Sahrens * For small buffers, we want a cache for each multiple of 138789Sahrens * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 139789Sahrens * for each quarter-power of 2. For large buffers, we want 140789Sahrens * a cache for each multiple of PAGESIZE. 141789Sahrens */ 142789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 143789Sahrens size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 144789Sahrens size_t p2 = size; 145789Sahrens size_t align = 0; 146789Sahrens 147789Sahrens while (p2 & (p2 - 1)) 148789Sahrens p2 &= p2 - 1; 149789Sahrens 150789Sahrens if (size <= 4 * SPA_MINBLOCKSIZE) { 151789Sahrens align = SPA_MINBLOCKSIZE; 152789Sahrens } else if (P2PHASE(size, PAGESIZE) == 0) { 153789Sahrens align = PAGESIZE; 154789Sahrens } else if (P2PHASE(size, p2 >> 2) == 0) { 155789Sahrens align = p2 >> 2; 156789Sahrens } 157789Sahrens 158789Sahrens if (align != 0) { 1593290Sjohansen char name[36]; 1602856Snd150628 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 161789Sahrens zio_buf_cache[c] = kmem_cache_create(name, size, 162849Sbonwick align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 1633290Sjohansen 1643290Sjohansen (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 1653290Sjohansen zio_data_buf_cache[c] = kmem_cache_create(name, size, 1663290Sjohansen align, NULL, NULL, NULL, NULL, data_alloc_arena, 1673290Sjohansen KMC_NODEBUG); 1683290Sjohansen 169789Sahrens } 170789Sahrens } 171789Sahrens 172789Sahrens while (--c != 0) { 173789Sahrens ASSERT(zio_buf_cache[c] != NULL); 174789Sahrens if (zio_buf_cache[c - 1] == NULL) 175789Sahrens zio_buf_cache[c - 1] = zio_buf_cache[c]; 1763290Sjohansen 1773290Sjohansen ASSERT(zio_data_buf_cache[c] != NULL); 1783290Sjohansen if (zio_data_buf_cache[c - 1] == NULL) 1793290Sjohansen zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 180789Sahrens } 1811544Seschrock 1825329Sgw25295 zio_taskq = taskq_create("zio_taskq", zio_resume_threads, 1835329Sgw25295 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 1845329Sgw25295 1851544Seschrock zio_inject_init(); 186789Sahrens } 187789Sahrens 188789Sahrens void 189789Sahrens zio_fini(void) 190789Sahrens { 191789Sahrens size_t c; 192789Sahrens kmem_cache_t *last_cache = NULL; 1933290Sjohansen kmem_cache_t *last_data_cache = NULL; 194789Sahrens 195789Sahrens for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 196789Sahrens if (zio_buf_cache[c] != last_cache) { 197789Sahrens last_cache = zio_buf_cache[c]; 198789Sahrens kmem_cache_destroy(zio_buf_cache[c]); 199789Sahrens } 200789Sahrens zio_buf_cache[c] = NULL; 2013290Sjohansen 2023290Sjohansen if (zio_data_buf_cache[c] != last_data_cache) { 2033290Sjohansen last_data_cache = zio_data_buf_cache[c]; 2043290Sjohansen kmem_cache_destroy(zio_data_buf_cache[c]); 2053290Sjohansen } 2063290Sjohansen zio_data_buf_cache[c] = NULL; 207789Sahrens } 2081544Seschrock 2095329Sgw25295 taskq_destroy(zio_taskq); 2105329Sgw25295 2114055Seschrock kmem_cache_destroy(zio_cache); 2124055Seschrock 2131544Seschrock zio_inject_fini(); 214789Sahrens } 215789Sahrens 216789Sahrens /* 217789Sahrens * ========================================================================== 218789Sahrens * Allocate and free I/O buffers 219789Sahrens * ========================================================================== 220789Sahrens */ 2213290Sjohansen 2223290Sjohansen /* 2233290Sjohansen * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 2243290Sjohansen * crashdump if the kernel panics, so use it judiciously. Obviously, it's 2253290Sjohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping 2263290Sjohansen * excess / transient data in-core during a crashdump. 2273290Sjohansen */ 228789Sahrens void * 229789Sahrens zio_buf_alloc(size_t size) 230789Sahrens { 231789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 232789Sahrens 233789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 234789Sahrens 235*6245Smaybee return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 236789Sahrens } 237789Sahrens 2383290Sjohansen /* 2393290Sjohansen * Use zio_data_buf_alloc to allocate data. The data will not appear in a 2403290Sjohansen * crashdump if the kernel panics. This exists so that we will limit the amount 2413290Sjohansen * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 2423290Sjohansen * of kernel heap dumped to disk when the kernel panics) 2433290Sjohansen */ 2443290Sjohansen void * 2453290Sjohansen zio_data_buf_alloc(size_t size) 2463290Sjohansen { 2473290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2483290Sjohansen 2493290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2503290Sjohansen 251*6245Smaybee return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 2523290Sjohansen } 2533290Sjohansen 254789Sahrens void 255789Sahrens zio_buf_free(void *buf, size_t size) 256789Sahrens { 257789Sahrens size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 258789Sahrens 259789Sahrens ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 260789Sahrens 261789Sahrens kmem_cache_free(zio_buf_cache[c], buf); 262789Sahrens } 263789Sahrens 2643290Sjohansen void 2653290Sjohansen zio_data_buf_free(void *buf, size_t size) 2663290Sjohansen { 2673290Sjohansen size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 2683290Sjohansen 2693290Sjohansen ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 2703290Sjohansen 2713290Sjohansen kmem_cache_free(zio_data_buf_cache[c], buf); 2723290Sjohansen } 2733463Sahrens 274789Sahrens /* 275789Sahrens * ========================================================================== 276789Sahrens * Push and pop I/O transform buffers 277789Sahrens * ========================================================================== 278789Sahrens */ 279789Sahrens static void 280789Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 281789Sahrens { 282789Sahrens zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 283789Sahrens 284789Sahrens zt->zt_data = data; 285789Sahrens zt->zt_size = size; 286789Sahrens zt->zt_bufsize = bufsize; 287789Sahrens 288789Sahrens zt->zt_next = zio->io_transform_stack; 289789Sahrens zio->io_transform_stack = zt; 290789Sahrens 291789Sahrens zio->io_data = data; 292789Sahrens zio->io_size = size; 293789Sahrens } 294789Sahrens 295789Sahrens static void 296789Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 297789Sahrens { 298789Sahrens zio_transform_t *zt = zio->io_transform_stack; 299789Sahrens 300789Sahrens *data = zt->zt_data; 301789Sahrens *size = zt->zt_size; 302789Sahrens *bufsize = zt->zt_bufsize; 303789Sahrens 304789Sahrens zio->io_transform_stack = zt->zt_next; 305789Sahrens kmem_free(zt, sizeof (zio_transform_t)); 306789Sahrens 307789Sahrens if ((zt = zio->io_transform_stack) != NULL) { 308789Sahrens zio->io_data = zt->zt_data; 309789Sahrens zio->io_size = zt->zt_size; 310789Sahrens } 311789Sahrens } 312789Sahrens 313789Sahrens static void 314789Sahrens zio_clear_transform_stack(zio_t *zio) 315789Sahrens { 316789Sahrens void *data; 317789Sahrens uint64_t size, bufsize; 318789Sahrens 319789Sahrens ASSERT(zio->io_transform_stack != NULL); 320789Sahrens 321789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 322789Sahrens while (zio->io_transform_stack != NULL) { 323789Sahrens zio_buf_free(data, bufsize); 324789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 325789Sahrens } 326789Sahrens } 327789Sahrens 328789Sahrens /* 329789Sahrens * ========================================================================== 330789Sahrens * Create the various types of I/O (read, write, free) 331789Sahrens * ========================================================================== 332789Sahrens */ 333789Sahrens static zio_t * 334789Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 335789Sahrens void *data, uint64_t size, zio_done_func_t *done, void *private, 336789Sahrens zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 337789Sahrens { 338789Sahrens zio_t *zio; 339789Sahrens 340789Sahrens ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 341789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 342789Sahrens 3434055Seschrock zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 3444055Seschrock bzero(zio, sizeof (zio_t)); 345789Sahrens zio->io_parent = pio; 346789Sahrens zio->io_spa = spa; 347789Sahrens zio->io_txg = txg; 3484634Sek110237 zio->io_flags = flags; 349789Sahrens if (bp != NULL) { 350789Sahrens zio->io_bp = bp; 351789Sahrens zio->io_bp_copy = *bp; 352789Sahrens zio->io_bp_orig = *bp; 353789Sahrens } 354789Sahrens zio->io_done = done; 355789Sahrens zio->io_private = private; 356789Sahrens zio->io_type = type; 357789Sahrens zio->io_priority = priority; 358789Sahrens zio->io_stage = stage; 359789Sahrens zio->io_pipeline = pipeline; 360789Sahrens zio->io_timestamp = lbolt64; 3612856Snd150628 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 3624831Sgw25295 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 363789Sahrens zio_push_transform(zio, data, size, size); 364789Sahrens 3653463Sahrens /* 3663463Sahrens * Note on config lock: 3673463Sahrens * 3683463Sahrens * If CONFIG_HELD is set, then the caller already has the config 3693463Sahrens * lock, so we don't need it for this io. 3703463Sahrens * 3713463Sahrens * We set CONFIG_GRABBED to indicate that we have grabbed the 3723463Sahrens * config lock on behalf of this io, so it should be released 3733463Sahrens * in zio_done. 3743463Sahrens * 3753463Sahrens * Unless CONFIG_HELD is set, we will grab the config lock for 3763463Sahrens * any top-level (parent-less) io, *except* NULL top-level ios. 3773463Sahrens * The NULL top-level ios rarely have any children, so we delay 3783463Sahrens * grabbing the lock until the first child is added (but it is 3793463Sahrens * still grabbed on behalf of the top-level i/o, so additional 3803463Sahrens * children don't need to also grab it). This greatly reduces 3813463Sahrens * contention on the config lock. 3823463Sahrens */ 383789Sahrens if (pio == NULL) { 3843463Sahrens if (type != ZIO_TYPE_NULL && 3853463Sahrens !(flags & ZIO_FLAG_CONFIG_HELD)) { 3865530Sbonwick spa_config_enter(spa, RW_READER, zio); 3873463Sahrens zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 3883463Sahrens } 389789Sahrens zio->io_root = zio; 390789Sahrens } else { 391789Sahrens zio->io_root = pio->io_root; 3921544Seschrock if (!(flags & ZIO_FLAG_NOBOOKMARK)) 3931544Seschrock zio->io_logical = pio->io_logical; 394789Sahrens mutex_enter(&pio->io_lock); 3953463Sahrens if (pio->io_parent == NULL && 3963463Sahrens pio->io_type == ZIO_TYPE_NULL && 3973463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 3983463Sahrens !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 3993463Sahrens pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 4005530Sbonwick spa_config_enter(spa, RW_READER, pio); 4013463Sahrens } 402789Sahrens if (stage < ZIO_STAGE_READY) 403789Sahrens pio->io_children_notready++; 404789Sahrens pio->io_children_notdone++; 405789Sahrens zio->io_sibling_next = pio->io_child; 406789Sahrens zio->io_sibling_prev = NULL; 407789Sahrens if (pio->io_child != NULL) 408789Sahrens pio->io_child->io_sibling_prev = zio; 409789Sahrens pio->io_child = zio; 4101775Sbillm zio->io_ndvas = pio->io_ndvas; 411789Sahrens mutex_exit(&pio->io_lock); 412789Sahrens } 413789Sahrens 4145329Sgw25295 /* 4155329Sgw25295 * Save off the original state incase we need to retry later. 4165329Sgw25295 */ 4175329Sgw25295 zio->io_orig_stage = zio->io_stage; 4185329Sgw25295 zio->io_orig_pipeline = zio->io_pipeline; 4195329Sgw25295 zio->io_orig_flags = zio->io_flags; 4205329Sgw25295 421789Sahrens return (zio); 422789Sahrens } 423789Sahrens 4245329Sgw25295 static void 4255329Sgw25295 zio_reset(zio_t *zio) 4265329Sgw25295 { 4275329Sgw25295 zio_clear_transform_stack(zio); 4285329Sgw25295 4295329Sgw25295 zio->io_flags = zio->io_orig_flags; 4305329Sgw25295 zio->io_stage = zio->io_orig_stage; 4315329Sgw25295 zio->io_pipeline = zio->io_orig_pipeline; 4325329Sgw25295 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); 4335329Sgw25295 } 4345329Sgw25295 435789Sahrens zio_t * 436789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 437789Sahrens int flags) 438789Sahrens { 439789Sahrens zio_t *zio; 440789Sahrens 441789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 442789Sahrens ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 443789Sahrens ZIO_WAIT_FOR_CHILDREN_PIPELINE); 444789Sahrens 445789Sahrens return (zio); 446789Sahrens } 447789Sahrens 448789Sahrens zio_t * 449789Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 450789Sahrens { 451789Sahrens return (zio_null(NULL, spa, done, private, flags)); 452789Sahrens } 453789Sahrens 454789Sahrens zio_t * 455789Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, 456789Sahrens uint64_t size, zio_done_func_t *done, void *private, 4571544Seschrock int priority, int flags, zbookmark_t *zb) 458789Sahrens { 459789Sahrens zio_t *zio; 460789Sahrens 461789Sahrens ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 462789Sahrens 4635329Sgw25295 /* 4645329Sgw25295 * If the user has specified that we allow I/Os to continue 4655329Sgw25295 * then attempt to satisfy the read. 4665329Sgw25295 */ 4675329Sgw25295 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 4685329Sgw25295 ZIO_ENTER(spa); 4695329Sgw25295 470789Sahrens zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 4712981Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 4722981Sahrens ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 4731544Seschrock zio->io_bookmark = *zb; 4741544Seschrock 4751544Seschrock zio->io_logical = zio; 476789Sahrens 477789Sahrens /* 478789Sahrens * Work off our copy of the bp so the caller can free it. 479789Sahrens */ 480789Sahrens zio->io_bp = &zio->io_bp_copy; 481789Sahrens 482789Sahrens return (zio); 483789Sahrens } 484789Sahrens 485789Sahrens zio_t * 4861775Sbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 487789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 4883547Smaybee zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 4893547Smaybee int flags, zbookmark_t *zb) 490789Sahrens { 491789Sahrens zio_t *zio; 492789Sahrens 493789Sahrens ASSERT(checksum >= ZIO_CHECKSUM_OFF && 494789Sahrens checksum < ZIO_CHECKSUM_FUNCTIONS); 495789Sahrens 496789Sahrens ASSERT(compress >= ZIO_COMPRESS_OFF && 497789Sahrens compress < ZIO_COMPRESS_FUNCTIONS); 498789Sahrens 4995329Sgw25295 ZIO_ENTER(spa); 5005329Sgw25295 501789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 5022981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 503789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 504789Sahrens 5053547Smaybee zio->io_ready = ready; 5063547Smaybee 5071544Seschrock zio->io_bookmark = *zb; 5081544Seschrock 5091544Seschrock zio->io_logical = zio; 5101544Seschrock 511789Sahrens zio->io_checksum = checksum; 512789Sahrens zio->io_compress = compress; 5131775Sbillm zio->io_ndvas = ncopies; 514789Sahrens 515789Sahrens if (bp->blk_birth != txg) { 516789Sahrens /* XXX the bp usually (always?) gets re-zeroed later */ 517789Sahrens BP_ZERO(bp); 518789Sahrens BP_SET_LSIZE(bp, size); 519789Sahrens BP_SET_PSIZE(bp, size); 5201775Sbillm } else { 5211775Sbillm /* Make sure someone doesn't change their mind on overwrites */ 5221775Sbillm ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 5231775Sbillm spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 524789Sahrens } 525789Sahrens 526789Sahrens return (zio); 527789Sahrens } 528789Sahrens 529789Sahrens zio_t * 530789Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum, 531789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 5321544Seschrock zio_done_func_t *done, void *private, int priority, int flags, 5331544Seschrock zbookmark_t *zb) 534789Sahrens { 535789Sahrens zio_t *zio; 536789Sahrens 537789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 5382981Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 5395530Sbonwick ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); 540789Sahrens 5411544Seschrock zio->io_bookmark = *zb; 542789Sahrens zio->io_checksum = checksum; 543789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 544789Sahrens 5451775Sbillm if (pio != NULL) 5461775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 5471775Sbillm 548789Sahrens return (zio); 549789Sahrens } 550789Sahrens 5515329Sgw25295 static void 5525329Sgw25295 zio_write_allocate_ready(zio_t *zio) 5535329Sgw25295 { 5545329Sgw25295 /* Free up the previous block */ 5555329Sgw25295 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 5565329Sgw25295 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 5575329Sgw25295 &zio->io_bp_orig, NULL, NULL)); 5585329Sgw25295 } 5595329Sgw25295 } 5605329Sgw25295 561789Sahrens static zio_t * 562789Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 563789Sahrens uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 564789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 565789Sahrens { 566789Sahrens zio_t *zio; 567789Sahrens 568789Sahrens BP_ZERO(bp); 569789Sahrens BP_SET_LSIZE(bp, size); 570789Sahrens BP_SET_PSIZE(bp, size); 571789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 572789Sahrens 573789Sahrens zio = zio_create(pio, spa, txg, bp, data, size, done, private, 574789Sahrens ZIO_TYPE_WRITE, priority, flags, 575789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 576789Sahrens 577789Sahrens zio->io_checksum = checksum; 578789Sahrens zio->io_compress = ZIO_COMPRESS_OFF; 5795329Sgw25295 zio->io_ready = zio_write_allocate_ready; 580789Sahrens 581789Sahrens return (zio); 582789Sahrens } 583789Sahrens 584789Sahrens zio_t * 585789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 586789Sahrens zio_done_func_t *done, void *private) 587789Sahrens { 588789Sahrens zio_t *zio; 589789Sahrens 590789Sahrens ASSERT(!BP_IS_HOLE(bp)); 591789Sahrens 592789Sahrens if (txg == spa->spa_syncing_txg && 593789Sahrens spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 594789Sahrens bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 595789Sahrens return (zio_null(pio, spa, NULL, NULL, 0)); 596789Sahrens } 597789Sahrens 598789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 5992981Sahrens ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 6005530Sbonwick ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); 601789Sahrens 602789Sahrens zio->io_bp = &zio->io_bp_copy; 603789Sahrens 604789Sahrens return (zio); 605789Sahrens } 606789Sahrens 607789Sahrens zio_t * 608789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 609789Sahrens zio_done_func_t *done, void *private) 610789Sahrens { 611789Sahrens zio_t *zio; 612789Sahrens 613789Sahrens /* 614789Sahrens * A claim is an allocation of a specific block. Claims are needed 615789Sahrens * to support immediate writes in the intent log. The issue is that 616789Sahrens * immediate writes contain committed data, but in a txg that was 617789Sahrens * *not* committed. Upon opening the pool after an unclean shutdown, 618789Sahrens * the intent log claims all blocks that contain immediate write data 619789Sahrens * so that the SPA knows they're in use. 620789Sahrens * 621789Sahrens * All claims *must* be resolved in the first txg -- before the SPA 622789Sahrens * starts allocating blocks -- so that nothing is allocated twice. 623789Sahrens */ 624789Sahrens ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 625789Sahrens ASSERT3U(spa_first_txg(spa), <=, txg); 626789Sahrens 627789Sahrens zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 628789Sahrens ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 6295530Sbonwick ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); 630789Sahrens 631789Sahrens zio->io_bp = &zio->io_bp_copy; 632789Sahrens 633789Sahrens return (zio); 634789Sahrens } 635789Sahrens 636789Sahrens zio_t * 637789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 638789Sahrens zio_done_func_t *done, void *private, int priority, int flags) 639789Sahrens { 640789Sahrens zio_t *zio; 641789Sahrens int c; 642789Sahrens 643789Sahrens if (vd->vdev_children == 0) { 644789Sahrens zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 645789Sahrens ZIO_TYPE_IOCTL, priority, flags, 646789Sahrens ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 647789Sahrens 648789Sahrens zio->io_vd = vd; 649789Sahrens zio->io_cmd = cmd; 650789Sahrens } else { 651789Sahrens zio = zio_null(pio, spa, NULL, NULL, flags); 652789Sahrens 653789Sahrens for (c = 0; c < vd->vdev_children; c++) 654789Sahrens zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 655789Sahrens done, private, priority, flags)); 656789Sahrens } 657789Sahrens 658789Sahrens return (zio); 659789Sahrens } 660789Sahrens 661789Sahrens static void 662789Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 6635450Sbrendan int checksum, boolean_t labels) 664789Sahrens { 665789Sahrens ASSERT(vd->vdev_children == 0); 666789Sahrens 667789Sahrens ASSERT(size <= SPA_MAXBLOCKSIZE); 668789Sahrens ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 669789Sahrens ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 670789Sahrens 6715450Sbrendan #ifdef ZFS_DEBUG 6725450Sbrendan if (labels) { 6735450Sbrendan ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 6745450Sbrendan offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 6755450Sbrendan } 6765450Sbrendan #endif 677789Sahrens ASSERT3U(offset + size, <=, vd->vdev_psize); 678789Sahrens 679789Sahrens BP_ZERO(bp); 680789Sahrens 681789Sahrens BP_SET_LSIZE(bp, size); 682789Sahrens BP_SET_PSIZE(bp, size); 683789Sahrens 684789Sahrens BP_SET_CHECKSUM(bp, checksum); 685789Sahrens BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 686789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 687789Sahrens 688789Sahrens if (checksum != ZIO_CHECKSUM_OFF) 689789Sahrens ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 690789Sahrens } 691789Sahrens 692789Sahrens zio_t * 693789Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 694789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 6955450Sbrendan int priority, int flags, boolean_t labels) 696789Sahrens { 697789Sahrens zio_t *zio; 698789Sahrens blkptr_t blk; 699789Sahrens 7005329Sgw25295 ZIO_ENTER(vd->vdev_spa); 7015329Sgw25295 7025450Sbrendan zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 703789Sahrens 704789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 705789Sahrens ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 706789Sahrens ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 707789Sahrens 708789Sahrens zio->io_vd = vd; 709789Sahrens zio->io_offset = offset; 710789Sahrens 711789Sahrens /* 712789Sahrens * Work off our copy of the bp so the caller can free it. 713789Sahrens */ 714789Sahrens zio->io_bp = &zio->io_bp_copy; 715789Sahrens 716789Sahrens return (zio); 717789Sahrens } 718789Sahrens 719789Sahrens zio_t * 720789Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 721789Sahrens void *data, int checksum, zio_done_func_t *done, void *private, 7225450Sbrendan int priority, int flags, boolean_t labels) 723789Sahrens { 724789Sahrens zio_block_tail_t *zbt; 725789Sahrens void *wbuf; 726789Sahrens zio_t *zio; 727789Sahrens blkptr_t blk; 728789Sahrens 7295329Sgw25295 ZIO_ENTER(vd->vdev_spa); 7305329Sgw25295 7315450Sbrendan zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 732789Sahrens 733789Sahrens zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 734789Sahrens ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 735789Sahrens ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 736789Sahrens 737789Sahrens zio->io_vd = vd; 738789Sahrens zio->io_offset = offset; 739789Sahrens 740789Sahrens zio->io_bp = &zio->io_bp_copy; 741789Sahrens zio->io_checksum = checksum; 742789Sahrens 743789Sahrens if (zio_checksum_table[checksum].ci_zbt) { 744789Sahrens /* 745789Sahrens * zbt checksums are necessarily destructive -- they modify 746789Sahrens * one word of the write buffer to hold the verifier/checksum. 747789Sahrens * Therefore, we must make a local copy in case the data is 748789Sahrens * being written to multiple places. 749789Sahrens */ 750789Sahrens wbuf = zio_buf_alloc(size); 751789Sahrens bcopy(data, wbuf, size); 752789Sahrens zio_push_transform(zio, wbuf, size, size); 753789Sahrens 754789Sahrens zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 755789Sahrens zbt->zbt_cksum = blk.blk_cksum; 756789Sahrens } 757789Sahrens 758789Sahrens return (zio); 759789Sahrens } 760789Sahrens 761789Sahrens /* 762789Sahrens * Create a child I/O to do some work for us. It has no associated bp. 763789Sahrens */ 764789Sahrens zio_t * 765789Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 766789Sahrens void *data, uint64_t size, int type, int priority, int flags, 767789Sahrens zio_done_func_t *done, void *private) 768789Sahrens { 769789Sahrens uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 770789Sahrens zio_t *cio; 771789Sahrens 772789Sahrens if (type == ZIO_TYPE_READ && bp != NULL) { 773789Sahrens /* 774789Sahrens * If we have the bp, then the child should perform the 775789Sahrens * checksum and the parent need not. This pushes error 776789Sahrens * detection as close to the leaves as possible and 777789Sahrens * eliminates redundant checksums in the interior nodes. 778789Sahrens */ 779789Sahrens pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 780789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 781789Sahrens } 782789Sahrens 783789Sahrens cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 784789Sahrens done, private, type, priority, 785789Sahrens (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 7861775Sbillm ZIO_STAGE_VDEV_IO_START - 1, pipeline); 787789Sahrens 788789Sahrens cio->io_vd = vd; 789789Sahrens cio->io_offset = offset; 790789Sahrens 791789Sahrens return (cio); 792789Sahrens } 793789Sahrens 794789Sahrens /* 795789Sahrens * ========================================================================== 796789Sahrens * Initiate I/O, either sync or async 797789Sahrens * ========================================================================== 798789Sahrens */ 799789Sahrens int 800789Sahrens zio_wait(zio_t *zio) 801789Sahrens { 802789Sahrens int error; 803789Sahrens 804789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 805789Sahrens 806789Sahrens zio->io_waiter = curthread; 807789Sahrens 8085530Sbonwick zio_execute(zio); 809789Sahrens 810789Sahrens mutex_enter(&zio->io_lock); 811789Sahrens while (zio->io_stalled != ZIO_STAGE_DONE) 812789Sahrens cv_wait(&zio->io_cv, &zio->io_lock); 813789Sahrens mutex_exit(&zio->io_lock); 814789Sahrens 815789Sahrens error = zio->io_error; 8162856Snd150628 mutex_destroy(&zio->io_lock); 8174831Sgw25295 cv_destroy(&zio->io_cv); 8184055Seschrock kmem_cache_free(zio_cache, zio); 819789Sahrens 820789Sahrens return (error); 821789Sahrens } 822789Sahrens 823789Sahrens void 824789Sahrens zio_nowait(zio_t *zio) 825789Sahrens { 8265530Sbonwick zio_execute(zio); 8275530Sbonwick } 8285530Sbonwick 8295530Sbonwick void 8305530Sbonwick zio_interrupt(zio_t *zio) 8315530Sbonwick { 8325530Sbonwick (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], 8335530Sbonwick (task_func_t *)zio_execute, zio, TQ_SLEEP); 8345530Sbonwick } 8355530Sbonwick 8365530Sbonwick static int 8375530Sbonwick zio_issue_async(zio_t *zio) 8385530Sbonwick { 8395530Sbonwick (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], 8405530Sbonwick (task_func_t *)zio_execute, zio, TQ_SLEEP); 8415530Sbonwick 8425530Sbonwick return (ZIO_PIPELINE_STOP); 843789Sahrens } 844789Sahrens 845789Sahrens /* 846789Sahrens * ========================================================================== 847789Sahrens * I/O pipeline interlocks: parent/child dependency scoreboarding 848789Sahrens * ========================================================================== 849789Sahrens */ 8505530Sbonwick static int 851789Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 852789Sahrens { 8535530Sbonwick int rv = ZIO_PIPELINE_CONTINUE; 8545530Sbonwick 855789Sahrens mutex_enter(&zio->io_lock); 8565530Sbonwick ASSERT(zio->io_stalled == 0); 8575530Sbonwick if (*countp != 0) { 858789Sahrens zio->io_stalled = stage; 8595530Sbonwick rv = ZIO_PIPELINE_STOP; 860789Sahrens } 8615530Sbonwick mutex_exit(&zio->io_lock); 8625530Sbonwick 8635530Sbonwick return (rv); 864789Sahrens } 865789Sahrens 866789Sahrens static void 867789Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 868789Sahrens { 869789Sahrens zio_t *pio = zio->io_parent; 870789Sahrens 871789Sahrens mutex_enter(&pio->io_lock); 872789Sahrens if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 873789Sahrens pio->io_error = zio->io_error; 8745329Sgw25295 ASSERT3U(*countp, >, 0); 875789Sahrens if (--*countp == 0 && pio->io_stalled == stage) { 876789Sahrens pio->io_stalled = 0; 877789Sahrens mutex_exit(&pio->io_lock); 8785530Sbonwick zio_execute(pio); 879789Sahrens } else { 880789Sahrens mutex_exit(&pio->io_lock); 881789Sahrens } 882789Sahrens } 883789Sahrens 8845530Sbonwick int 8855530Sbonwick zio_wait_for_children_ready(zio_t *zio) 886789Sahrens { 8875530Sbonwick return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 8885530Sbonwick &zio->io_children_notready)); 889789Sahrens } 890789Sahrens 8915530Sbonwick int 8925530Sbonwick zio_wait_for_children_done(zio_t *zio) 893789Sahrens { 8945530Sbonwick return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, 8955530Sbonwick &zio->io_children_notdone)); 896789Sahrens } 897789Sahrens 8985530Sbonwick static int 8995329Sgw25295 zio_read_init(zio_t *zio) 9005329Sgw25295 { 9015530Sbonwick blkptr_t *bp = zio->io_bp; 9025530Sbonwick 9035530Sbonwick if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 9045530Sbonwick uint64_t csize = BP_GET_PSIZE(bp); 9055329Sgw25295 void *cbuf = zio_buf_alloc(csize); 9065329Sgw25295 9075329Sgw25295 zio_push_transform(zio, cbuf, csize, csize); 9085329Sgw25295 zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 9095329Sgw25295 } 9105329Sgw25295 9115530Sbonwick if (BP_IS_GANG(bp)) { 9125329Sgw25295 uint64_t gsize = SPA_GANGBLOCKSIZE; 9135329Sgw25295 void *gbuf = zio_buf_alloc(gsize); 9145329Sgw25295 9155329Sgw25295 zio_push_transform(zio, gbuf, gsize, gsize); 9165329Sgw25295 zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 9175329Sgw25295 } 9185530Sbonwick 9195530Sbonwick if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 9205530Sbonwick zio->io_flags |= ZIO_FLAG_DONT_CACHE; 9215530Sbonwick 9225530Sbonwick return (ZIO_PIPELINE_CONTINUE); 9235329Sgw25295 } 9245329Sgw25295 9255530Sbonwick static int 926789Sahrens zio_ready(zio_t *zio) 927789Sahrens { 928789Sahrens zio_t *pio = zio->io_parent; 929789Sahrens 9303547Smaybee if (zio->io_ready) 9313547Smaybee zio->io_ready(zio); 9323547Smaybee 933789Sahrens if (pio != NULL) 9345530Sbonwick zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 935789Sahrens &pio->io_children_notready); 936789Sahrens 937789Sahrens if (zio->io_bp) 938789Sahrens zio->io_bp_copy = *zio->io_bp; 939789Sahrens 9405530Sbonwick return (ZIO_PIPELINE_CONTINUE); 941789Sahrens } 942789Sahrens 9435530Sbonwick static int 9445329Sgw25295 zio_vdev_retry_io(zio_t *zio) 945789Sahrens { 946789Sahrens zio_t *pio = zio->io_parent; 9475329Sgw25295 9485329Sgw25295 /* 9495329Sgw25295 * Preserve the failed bp so that the io_ready() callback can 9505329Sgw25295 * update the accounting accordingly. The callback will also be 9515329Sgw25295 * responsible for freeing the previously allocated block, if one 9525329Sgw25295 * exists. 9535329Sgw25295 */ 9545329Sgw25295 zio->io_bp_orig = *zio->io_bp; 9555329Sgw25295 9565329Sgw25295 /* 9575329Sgw25295 * We must zero out the old DVA and blk_birth before reallocating 9585403Sgw25295 * the bp. 9595329Sgw25295 */ 9605403Sgw25295 BP_ZERO_DVAS(zio->io_bp); 9615329Sgw25295 zio_reset(zio); 9625329Sgw25295 9635329Sgw25295 if (pio) { 9645329Sgw25295 /* 9655329Sgw25295 * Let the parent know that we will 9665329Sgw25295 * re-alloc the write (=> new bp info). 9675329Sgw25295 */ 9685329Sgw25295 mutex_enter(&pio->io_lock); 9695329Sgw25295 pio->io_children_notready++; 9705329Sgw25295 9715329Sgw25295 /* 9725329Sgw25295 * If the parent I/O is still in the open stage, then 9735329Sgw25295 * don't bother telling it to retry since it hasn't 9745329Sgw25295 * progressed far enough for it to care. 9755329Sgw25295 */ 9765329Sgw25295 if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) 9775329Sgw25295 pio->io_flags |= ZIO_FLAG_WRITE_RETRY; 9785329Sgw25295 9795530Sbonwick ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE); 9805329Sgw25295 mutex_exit(&pio->io_lock); 9815329Sgw25295 } 9825329Sgw25295 9835329Sgw25295 /* 9845329Sgw25295 * We are getting ready to process the retry request so clear 9855329Sgw25295 * the flag and the zio's current error status. 9865329Sgw25295 */ 9875329Sgw25295 zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; 9885329Sgw25295 zio->io_error = 0; 9895530Sbonwick 9905530Sbonwick return (ZIO_PIPELINE_CONTINUE); 9915329Sgw25295 } 9925329Sgw25295 9935329Sgw25295 int 9945329Sgw25295 zio_vdev_resume_io(spa_t *spa) 9955329Sgw25295 { 9965329Sgw25295 zio_t *zio; 9975329Sgw25295 9985329Sgw25295 mutex_enter(&spa->spa_zio_lock); 9995329Sgw25295 10005329Sgw25295 /* 10015329Sgw25295 * Probe all of vdevs that have experienced an I/O error. 10025329Sgw25295 * If we are still unable to verify the integrity of the vdev 10035329Sgw25295 * then we prevent the resume from proceeeding. 10045329Sgw25295 */ 10055329Sgw25295 for (zio = list_head(&spa->spa_zio_list); zio != NULL; 10065329Sgw25295 zio = list_next(&spa->spa_zio_list, zio)) { 10075329Sgw25295 int error = 0; 10085329Sgw25295 10095329Sgw25295 /* We only care about I/Os that must succeed */ 10105329Sgw25295 if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL) 10115329Sgw25295 continue; 10125329Sgw25295 error = vdev_probe(zio->io_vd); 10135329Sgw25295 if (error) { 10145329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10155329Sgw25295 return (error); 10165329Sgw25295 } 10175329Sgw25295 } 10185329Sgw25295 10195329Sgw25295 /* 10205329Sgw25295 * Clear the vdev stats so that I/O can flow. 10215329Sgw25295 */ 10225329Sgw25295 vdev_clear(spa, NULL, B_FALSE); 10235329Sgw25295 10245329Sgw25295 spa->spa_state = POOL_STATE_ACTIVE; 10255329Sgw25295 while ((zio = list_head(&spa->spa_zio_list)) != NULL) { 10265329Sgw25295 list_remove(&spa->spa_zio_list, zio); 10275329Sgw25295 zio->io_error = 0; 10285329Sgw25295 10295329Sgw25295 /* 10305329Sgw25295 * If we are resuming an allocating I/O then we force it 10315329Sgw25295 * to retry and let it resume operation where it left off. 10325329Sgw25295 * Otherwise, go back to the ready stage and pick up from 10335329Sgw25295 * there. 10345329Sgw25295 */ 10355329Sgw25295 if (zio_write_retry && IO_IS_ALLOCATING(zio)) { 10365329Sgw25295 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 10375329Sgw25295 zio->io_stage--; 10385329Sgw25295 } else { 10395329Sgw25295 zio->io_stage = ZIO_STAGE_READY; 10405329Sgw25295 } 10415329Sgw25295 10425530Sbonwick (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute, 10435329Sgw25295 zio, TQ_SLEEP); 10445329Sgw25295 } 10455329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10465329Sgw25295 10475329Sgw25295 /* 10485329Sgw25295 * Wait for the taskqs to finish and recheck the pool state since 10495329Sgw25295 * it's possible that a resumed I/O has failed again. 10505329Sgw25295 */ 10515329Sgw25295 taskq_wait(zio_taskq); 10525329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 10535329Sgw25295 return (EIO); 10545329Sgw25295 10555329Sgw25295 mutex_enter(&spa->spa_zio_lock); 10565329Sgw25295 cv_broadcast(&spa->spa_zio_cv); 10575329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10585329Sgw25295 10595329Sgw25295 return (0); 10605329Sgw25295 } 10615329Sgw25295 10625530Sbonwick static int 10635329Sgw25295 zio_vdev_suspend_io(zio_t *zio) 10645329Sgw25295 { 10655329Sgw25295 spa_t *spa = zio->io_spa; 10665329Sgw25295 10675329Sgw25295 /* 10685329Sgw25295 * We've experienced an unrecoverable failure so 10695329Sgw25295 * set the pool state accordingly and queue all 10705329Sgw25295 * failed IOs. 10715329Sgw25295 */ 10725329Sgw25295 spa->spa_state = POOL_STATE_IO_FAILURE; 10735329Sgw25295 10745329Sgw25295 mutex_enter(&spa->spa_zio_lock); 10755329Sgw25295 list_insert_tail(&spa->spa_zio_list, zio); 10765329Sgw25295 10775329Sgw25295 #ifndef _KERNEL 10785329Sgw25295 /* Used to notify ztest that the pool has suspended */ 10795329Sgw25295 cv_broadcast(&spa->spa_zio_cv); 10805329Sgw25295 #endif 10815329Sgw25295 mutex_exit(&spa->spa_zio_lock); 10825530Sbonwick 10835530Sbonwick return (ZIO_PIPELINE_STOP); 10845329Sgw25295 } 10855329Sgw25295 10865530Sbonwick static int 10875329Sgw25295 zio_assess(zio_t *zio) 10885329Sgw25295 { 1089789Sahrens spa_t *spa = zio->io_spa; 1090789Sahrens blkptr_t *bp = zio->io_bp; 1091789Sahrens vdev_t *vd = zio->io_vd; 1092789Sahrens 1093789Sahrens ASSERT(zio->io_children_notready == 0); 1094789Sahrens ASSERT(zio->io_children_notdone == 0); 1095789Sahrens 1096789Sahrens if (bp != NULL) { 1097789Sahrens ASSERT(bp->blk_pad[0] == 0); 1098789Sahrens ASSERT(bp->blk_pad[1] == 0); 1099789Sahrens ASSERT(bp->blk_pad[2] == 0); 1100789Sahrens ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 1101789Sahrens if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 11021775Sbillm !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 1103789Sahrens ASSERT(!BP_SHOULD_BYTESWAP(bp)); 11041775Sbillm if (zio->io_ndvas != 0) 11051775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 11061775Sbillm ASSERT(BP_COUNT_GANG(bp) == 0 || 11071775Sbillm (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 11081775Sbillm } 1109789Sahrens } 1110789Sahrens 11115329Sgw25295 /* 11125329Sgw25295 * Some child I/O has indicated that a retry is necessary, so 11135329Sgw25295 * we set an error on the I/O and let the logic below do the 11145329Sgw25295 * rest. 11155329Sgw25295 */ 11165329Sgw25295 if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) 11175329Sgw25295 zio->io_error = ERESTART; 11185329Sgw25295 1119789Sahrens if (vd != NULL) 1120789Sahrens vdev_stat_update(zio); 1121789Sahrens 1122789Sahrens if (zio->io_error) { 11231544Seschrock /* 11241544Seschrock * If this I/O is attached to a particular vdev, 11251544Seschrock * generate an error message describing the I/O failure 11261544Seschrock * at the block level. We ignore these errors if the 11271544Seschrock * device is currently unavailable. 11281544Seschrock */ 11291732Sbonwick if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 11305329Sgw25295 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 1131789Sahrens 11321544Seschrock if ((zio->io_error == EIO || 11331544Seschrock !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 11341544Seschrock zio->io_logical == zio) { 11351544Seschrock /* 11361544Seschrock * For root I/O requests, tell the SPA to log the error 11371544Seschrock * appropriately. Also, generate a logical data 11381544Seschrock * ereport. 11391544Seschrock */ 11405329Sgw25295 spa_log_error(spa, zio); 11411544Seschrock 11425329Sgw25295 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 11435329Sgw25295 0, 0); 11441544Seschrock } 1145789Sahrens 11461544Seschrock /* 11475403Sgw25295 * If we are an allocating I/O then we attempt to reissue 11485403Sgw25295 * the I/O on another vdev unless the pool is out of space. 11495403Sgw25295 * We handle this condition based on the spa's failmode 11505403Sgw25295 * property. 11515329Sgw25295 */ 11525329Sgw25295 if (zio_write_retry && zio->io_error != ENOSPC && 11535530Sbonwick IO_IS_ALLOCATING(zio)) 11545530Sbonwick return (zio_vdev_retry_io(zio)); 11555530Sbonwick 11565329Sgw25295 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 11575329Sgw25295 11585329Sgw25295 /* 11595329Sgw25295 * For I/O requests that cannot fail, we carry out 11605329Sgw25295 * the requested behavior based on the failmode pool 11615329Sgw25295 * property. 11625329Sgw25295 * 11635329Sgw25295 * XXX - Need to differentiate between an ENOSPC as 11645329Sgw25295 * a result of vdev failures vs. a full pool. 11651544Seschrock */ 11661544Seschrock if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 11673459Sek110237 char *blkbuf; 11683459Sek110237 11695329Sgw25295 #ifdef ZFS_DEBUG 11703459Sek110237 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 11713459Sek110237 if (blkbuf) { 11723459Sek110237 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 11733459Sek110237 bp ? bp : &zio->io_bp_copy); 11743459Sek110237 } 11755329Sgw25295 cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p " 11765329Sgw25295 "%s): error %d", zio->io_error == ECKSUM ? 11771544Seschrock "bad checksum" : "I/O failure", 11781544Seschrock zio_type_name[zio->io_type], 11791544Seschrock vdev_description(vd), 11801544Seschrock (u_longlong_t)zio->io_offset, 11815329Sgw25295 (void *)zio, blkbuf ? blkbuf : "", zio->io_error); 11825329Sgw25295 #endif 11835329Sgw25295 11845329Sgw25295 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { 11855329Sgw25295 fm_panic("Pool '%s' has encountered an " 11865329Sgw25295 "uncorrectable I/O failure and the " 11875329Sgw25295 "failure mode property for this pool " 11885329Sgw25295 "is set to panic.", spa_name(spa)); 11895329Sgw25295 } 11905530Sbonwick cmn_err(CE_WARN, "Pool '%s' has encountered " 11915530Sbonwick "an uncorrectable I/O error. " 11925530Sbonwick "Manual intervention is required.", spa_name(spa)); 11935530Sbonwick return (zio_vdev_suspend_io(zio)); 11941544Seschrock } 1195789Sahrens } 11965329Sgw25295 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 11975329Sgw25295 ASSERT(zio->io_children_notready == 0); 11985530Sbonwick 11995530Sbonwick return (ZIO_PIPELINE_CONTINUE); 12005329Sgw25295 } 12015329Sgw25295 12025530Sbonwick static int 12035329Sgw25295 zio_done(zio_t *zio) 12045329Sgw25295 { 12055329Sgw25295 zio_t *pio = zio->io_parent; 12065329Sgw25295 spa_t *spa = zio->io_spa; 12075329Sgw25295 12085329Sgw25295 ASSERT(zio->io_children_notready == 0); 12095329Sgw25295 ASSERT(zio->io_children_notdone == 0); 12105329Sgw25295 1211789Sahrens zio_clear_transform_stack(zio); 1212789Sahrens 1213789Sahrens if (zio->io_done) 1214789Sahrens zio->io_done(zio); 1215789Sahrens 1216789Sahrens ASSERT(zio->io_delegate_list == NULL); 1217789Sahrens ASSERT(zio->io_delegate_next == NULL); 1218789Sahrens 1219789Sahrens if (pio != NULL) { 1220789Sahrens zio_t *next, *prev; 1221789Sahrens 1222789Sahrens mutex_enter(&pio->io_lock); 1223789Sahrens next = zio->io_sibling_next; 1224789Sahrens prev = zio->io_sibling_prev; 1225789Sahrens if (next != NULL) 1226789Sahrens next->io_sibling_prev = prev; 1227789Sahrens if (prev != NULL) 1228789Sahrens prev->io_sibling_next = next; 1229789Sahrens if (pio->io_child == zio) 1230789Sahrens pio->io_child = next; 1231789Sahrens mutex_exit(&pio->io_lock); 1232789Sahrens 12335530Sbonwick zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, 1234789Sahrens &pio->io_children_notdone); 1235789Sahrens } 1236789Sahrens 12373463Sahrens /* 12384055Seschrock * Note: this I/O is now done, and will shortly be freed, so there is no 12394055Seschrock * need to clear this (or any other) flag. 12403463Sahrens */ 12413463Sahrens if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 12421544Seschrock spa_config_exit(spa, zio); 1243789Sahrens 1244789Sahrens if (zio->io_waiter != NULL) { 1245789Sahrens mutex_enter(&zio->io_lock); 1246789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1247789Sahrens zio->io_stalled = zio->io_stage; 1248789Sahrens cv_broadcast(&zio->io_cv); 1249789Sahrens mutex_exit(&zio->io_lock); 1250789Sahrens } else { 12514831Sgw25295 mutex_destroy(&zio->io_lock); 12524831Sgw25295 cv_destroy(&zio->io_cv); 12534055Seschrock kmem_cache_free(zio_cache, zio); 1254789Sahrens } 12555530Sbonwick 12565530Sbonwick return (ZIO_PIPELINE_STOP); 1257789Sahrens } 1258789Sahrens 1259789Sahrens /* 1260789Sahrens * ========================================================================== 1261789Sahrens * Compression support 1262789Sahrens * ========================================================================== 1263789Sahrens */ 12645530Sbonwick static int 1265789Sahrens zio_write_compress(zio_t *zio) 1266789Sahrens { 1267789Sahrens int compress = zio->io_compress; 1268789Sahrens blkptr_t *bp = zio->io_bp; 1269789Sahrens void *cbuf; 1270789Sahrens uint64_t lsize = zio->io_size; 1271789Sahrens uint64_t csize = lsize; 1272789Sahrens uint64_t cbufsize = 0; 1273789Sahrens int pass; 1274789Sahrens 1275789Sahrens if (bp->blk_birth == zio->io_txg) { 1276789Sahrens /* 1277789Sahrens * We're rewriting an existing block, which means we're 1278789Sahrens * working on behalf of spa_sync(). For spa_sync() to 1279789Sahrens * converge, it must eventually be the case that we don't 1280789Sahrens * have to allocate new blocks. But compression changes 1281789Sahrens * the blocksize, which forces a reallocate, and makes 1282789Sahrens * convergence take longer. Therefore, after the first 1283789Sahrens * few passes, stop compressing to ensure convergence. 1284789Sahrens */ 1285789Sahrens pass = spa_sync_pass(zio->io_spa); 1286789Sahrens if (pass > zio_sync_pass.zp_dontcompress) 1287789Sahrens compress = ZIO_COMPRESS_OFF; 1288789Sahrens } else { 1289789Sahrens ASSERT(BP_IS_HOLE(bp)); 1290789Sahrens pass = 1; 1291789Sahrens } 1292789Sahrens 1293789Sahrens if (compress != ZIO_COMPRESS_OFF) 1294789Sahrens if (!zio_compress_data(compress, zio->io_data, zio->io_size, 1295789Sahrens &cbuf, &csize, &cbufsize)) 1296789Sahrens compress = ZIO_COMPRESS_OFF; 1297789Sahrens 1298789Sahrens if (compress != ZIO_COMPRESS_OFF && csize != 0) 1299789Sahrens zio_push_transform(zio, cbuf, csize, cbufsize); 1300789Sahrens 1301789Sahrens /* 1302789Sahrens * The final pass of spa_sync() must be all rewrites, but the first 1303789Sahrens * few passes offer a trade-off: allocating blocks defers convergence, 1304789Sahrens * but newly allocated blocks are sequential, so they can be written 1305789Sahrens * to disk faster. Therefore, we allow the first few passes of 1306789Sahrens * spa_sync() to reallocate new blocks, but force rewrites after that. 1307789Sahrens * There should only be a handful of blocks after pass 1 in any case. 1308789Sahrens */ 1309789Sahrens if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 1310789Sahrens pass > zio_sync_pass.zp_rewrite) { 1311789Sahrens ASSERT(csize != 0); 13122885Sahrens BP_SET_LSIZE(bp, lsize); 13132885Sahrens BP_SET_COMPRESS(bp, compress); 13145530Sbonwick zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp); 1315789Sahrens } else { 13163882Sahrens if (bp->blk_birth == zio->io_txg) 13173882Sahrens BP_ZERO(bp); 1318789Sahrens if (csize == 0) { 1319789Sahrens BP_ZERO(bp); 1320789Sahrens zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1321789Sahrens } else { 13221775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1323789Sahrens BP_SET_LSIZE(bp, lsize); 1324789Sahrens BP_SET_PSIZE(bp, csize); 1325789Sahrens BP_SET_COMPRESS(bp, compress); 1326789Sahrens } 1327789Sahrens } 1328789Sahrens 13295530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1330789Sahrens } 1331789Sahrens 13325530Sbonwick static int 1333789Sahrens zio_read_decompress(zio_t *zio) 1334789Sahrens { 1335789Sahrens blkptr_t *bp = zio->io_bp; 1336789Sahrens void *data; 1337789Sahrens uint64_t size; 1338789Sahrens uint64_t bufsize; 1339789Sahrens int compress = BP_GET_COMPRESS(bp); 1340789Sahrens 1341789Sahrens ASSERT(compress != ZIO_COMPRESS_OFF); 1342789Sahrens 1343789Sahrens zio_pop_transform(zio, &data, &size, &bufsize); 1344789Sahrens 1345789Sahrens if (zio_decompress_data(compress, data, size, 1346789Sahrens zio->io_data, zio->io_size)) 1347789Sahrens zio->io_error = EIO; 1348789Sahrens 1349789Sahrens zio_buf_free(data, bufsize); 1350789Sahrens 13515530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1352789Sahrens } 1353789Sahrens 1354789Sahrens /* 1355789Sahrens * ========================================================================== 1356789Sahrens * Gang block support 1357789Sahrens * ========================================================================== 1358789Sahrens */ 1359789Sahrens static void 1360789Sahrens zio_gang_byteswap(zio_t *zio) 1361789Sahrens { 1362789Sahrens ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1363789Sahrens 1364789Sahrens if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1365789Sahrens byteswap_uint64_array(zio->io_data, zio->io_size); 1366789Sahrens } 1367789Sahrens 13685530Sbonwick static int 1369789Sahrens zio_get_gang_header(zio_t *zio) 1370789Sahrens { 1371789Sahrens blkptr_t *bp = zio->io_bp; 1372789Sahrens uint64_t gsize = SPA_GANGBLOCKSIZE; 1373789Sahrens void *gbuf = zio_buf_alloc(gsize); 1374789Sahrens 13751775Sbillm ASSERT(BP_IS_GANG(bp)); 1376789Sahrens 1377789Sahrens zio_push_transform(zio, gbuf, gsize, gsize); 1378789Sahrens 1379789Sahrens zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1380789Sahrens NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1381789Sahrens zio->io_flags & ZIO_FLAG_GANG_INHERIT, 13825329Sgw25295 ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); 1383789Sahrens 13845530Sbonwick return (zio_wait_for_children_done(zio)); 1385789Sahrens } 1386789Sahrens 13875530Sbonwick static int 1388789Sahrens zio_read_gang_members(zio_t *zio) 1389789Sahrens { 1390789Sahrens zio_gbh_phys_t *gbh; 1391789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1392789Sahrens int i; 1393789Sahrens 13941775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1395789Sahrens 1396789Sahrens zio_gang_byteswap(zio); 1397789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1398789Sahrens 1399789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1400789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1401789Sahrens lsize = BP_GET_PSIZE(gbp); 1402789Sahrens 1403789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1404789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1405789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1406789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1407789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1408789Sahrens 1409789Sahrens zio_nowait(zio_read(zio, zio->io_spa, gbp, 14105530Sbonwick (char *)zio->io_data + loff, lsize, 14115530Sbonwick NULL, NULL, zio->io_priority, 14125530Sbonwick zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); 1413789Sahrens } 1414789Sahrens 1415789Sahrens zio_buf_free(gbh, gbufsize); 14165530Sbonwick 14175530Sbonwick return (zio_wait_for_children_done(zio)); 1418789Sahrens } 1419789Sahrens 14205530Sbonwick static int 1421789Sahrens zio_rewrite_gang_members(zio_t *zio) 1422789Sahrens { 1423789Sahrens zio_gbh_phys_t *gbh; 1424789Sahrens uint64_t gsize, gbufsize, loff, lsize; 1425789Sahrens int i; 1426789Sahrens 14271775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1428789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1429789Sahrens 1430789Sahrens zio_gang_byteswap(zio); 1431789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1432789Sahrens 1433789Sahrens ASSERT(gsize == gbufsize); 1434789Sahrens 1435789Sahrens for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1436789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1437789Sahrens lsize = BP_GET_PSIZE(gbp); 1438789Sahrens 1439789Sahrens ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1440789Sahrens ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1441789Sahrens ASSERT3U(loff + lsize, <=, zio->io_size); 1442789Sahrens ASSERT(i < SPA_GBH_NBLKPTRS); 1443789Sahrens ASSERT(!BP_IS_HOLE(gbp)); 1444789Sahrens 1445789Sahrens zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, 1446789Sahrens zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, 14475530Sbonwick NULL, NULL, zio->io_priority, 14485530Sbonwick zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); 1449789Sahrens } 1450789Sahrens 1451789Sahrens zio_push_transform(zio, gbh, gsize, gbufsize); 14525530Sbonwick 14535530Sbonwick return (zio_wait_for_children_ready(zio)); 1454789Sahrens } 1455789Sahrens 14565530Sbonwick static int 1457789Sahrens zio_free_gang_members(zio_t *zio) 1458789Sahrens { 1459789Sahrens zio_gbh_phys_t *gbh; 1460789Sahrens uint64_t gsize, gbufsize; 1461789Sahrens int i; 1462789Sahrens 14631775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1464789Sahrens 1465789Sahrens zio_gang_byteswap(zio); 1466789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1467789Sahrens 1468789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1469789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1470789Sahrens 1471789Sahrens if (BP_IS_HOLE(gbp)) 1472789Sahrens continue; 1473789Sahrens zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1474789Sahrens gbp, NULL, NULL)); 1475789Sahrens } 1476789Sahrens 1477789Sahrens zio_buf_free(gbh, gbufsize); 14785530Sbonwick 14795530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1480789Sahrens } 1481789Sahrens 14825530Sbonwick static int 1483789Sahrens zio_claim_gang_members(zio_t *zio) 1484789Sahrens { 1485789Sahrens zio_gbh_phys_t *gbh; 1486789Sahrens uint64_t gsize, gbufsize; 1487789Sahrens int i; 1488789Sahrens 14891775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1490789Sahrens 1491789Sahrens zio_gang_byteswap(zio); 1492789Sahrens zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1493789Sahrens 1494789Sahrens for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1495789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 1496789Sahrens if (BP_IS_HOLE(gbp)) 1497789Sahrens continue; 1498789Sahrens zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1499789Sahrens gbp, NULL, NULL)); 1500789Sahrens } 1501789Sahrens 1502789Sahrens zio_buf_free(gbh, gbufsize); 15035530Sbonwick 15045530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1505789Sahrens } 1506789Sahrens 1507789Sahrens static void 1508789Sahrens zio_write_allocate_gang_member_done(zio_t *zio) 1509789Sahrens { 1510789Sahrens zio_t *pio = zio->io_parent; 15111775Sbillm dva_t *cdva = zio->io_bp->blk_dva; 15121775Sbillm dva_t *pdva = pio->io_bp->blk_dva; 1513789Sahrens uint64_t asize; 15141775Sbillm int d; 1515789Sahrens 15161775Sbillm ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 15171775Sbillm ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 15181775Sbillm ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 15191775Sbillm ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 15201775Sbillm 1521789Sahrens mutex_enter(&pio->io_lock); 15221775Sbillm for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 15231775Sbillm ASSERT(DVA_GET_GANG(&pdva[d])); 15241775Sbillm asize = DVA_GET_ASIZE(&pdva[d]); 15251775Sbillm asize += DVA_GET_ASIZE(&cdva[d]); 15261775Sbillm DVA_SET_ASIZE(&pdva[d], asize); 15271775Sbillm } 1528789Sahrens mutex_exit(&pio->io_lock); 1529789Sahrens } 1530789Sahrens 15315329Sgw25295 static int 15324527Sperrin zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) 1533789Sahrens { 1534789Sahrens blkptr_t *bp = zio->io_bp; 15351775Sbillm dva_t *dva = bp->blk_dva; 15361775Sbillm spa_t *spa = zio->io_spa; 1537789Sahrens zio_gbh_phys_t *gbh; 15381775Sbillm uint64_t txg = zio->io_txg; 1539789Sahrens uint64_t resid = zio->io_size; 1540789Sahrens uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1541789Sahrens uint64_t gsize, loff, lsize; 1542789Sahrens uint32_t gbps_left; 15431775Sbillm int ndvas = zio->io_ndvas; 15441775Sbillm int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1545789Sahrens int error; 15461775Sbillm int i, d; 1547789Sahrens 1548789Sahrens gsize = SPA_GANGBLOCKSIZE; 1549789Sahrens gbps_left = SPA_GBH_NBLKPTRS; 1550789Sahrens 15514527Sperrin error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, 15524527Sperrin B_FALSE); 15535530Sbonwick if (error) { 15545530Sbonwick zio->io_error = error; 15555530Sbonwick return (ZIO_PIPELINE_CONTINUE); 15565530Sbonwick } 1557789Sahrens 15581775Sbillm for (d = 0; d < gbh_ndvas; d++) 15591775Sbillm DVA_SET_GANG(&dva[d], 1); 1560789Sahrens 15611775Sbillm bp->blk_birth = txg; 1562789Sahrens 1563789Sahrens gbh = zio_buf_alloc(gsize); 1564789Sahrens bzero(gbh, gsize); 1565789Sahrens 1566789Sahrens for (loff = 0, i = 0; loff != zio->io_size; 1567789Sahrens loff += lsize, resid -= lsize, gbps_left--, i++) { 1568789Sahrens blkptr_t *gbp = &gbh->zg_blkptr[i]; 15691775Sbillm dva = gbp->blk_dva; 1570789Sahrens 1571789Sahrens ASSERT(gbps_left != 0); 1572789Sahrens maxalloc = MIN(maxalloc, resid); 1573789Sahrens 1574789Sahrens while (resid <= maxalloc * gbps_left) { 15754527Sperrin error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas, 15763063Sperrin txg, bp, B_FALSE); 1577789Sahrens if (error == 0) 1578789Sahrens break; 1579789Sahrens ASSERT3U(error, ==, ENOSPC); 15805329Sgw25295 /* XXX - free up previous allocations? */ 15815530Sbonwick if (maxalloc == SPA_MINBLOCKSIZE) { 15825530Sbonwick zio->io_error = error; 15835530Sbonwick return (ZIO_PIPELINE_CONTINUE); 15845530Sbonwick } 1585789Sahrens maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1586789Sahrens } 1587789Sahrens 1588789Sahrens if (resid <= maxalloc * gbps_left) { 1589789Sahrens lsize = maxalloc; 1590789Sahrens BP_SET_LSIZE(gbp, lsize); 1591789Sahrens BP_SET_PSIZE(gbp, lsize); 1592789Sahrens BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 15931775Sbillm gbp->blk_birth = txg; 15941775Sbillm zio_nowait(zio_rewrite(zio, spa, 15951775Sbillm zio->io_checksum, txg, gbp, 1596789Sahrens (char *)zio->io_data + loff, lsize, 1597789Sahrens zio_write_allocate_gang_member_done, NULL, 15985403Sgw25295 zio->io_priority, 15995403Sgw25295 zio->io_flags & ZIO_FLAG_GANG_INHERIT, 16001544Seschrock &zio->io_bookmark)); 1601789Sahrens } else { 1602789Sahrens lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1603789Sahrens ASSERT(lsize != SPA_MINBLOCKSIZE); 16041775Sbillm zio_nowait(zio_write_allocate(zio, spa, 16051775Sbillm zio->io_checksum, txg, gbp, 1606789Sahrens (char *)zio->io_data + loff, lsize, 1607789Sahrens zio_write_allocate_gang_member_done, NULL, 16085403Sgw25295 zio->io_priority, 16095403Sgw25295 zio->io_flags & ZIO_FLAG_GANG_INHERIT)); 1610789Sahrens } 1611789Sahrens } 1612789Sahrens 1613789Sahrens ASSERT(resid == 0 && loff == zio->io_size); 1614789Sahrens 1615789Sahrens zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1616789Sahrens 1617789Sahrens zio_push_transform(zio, gbh, gsize, gsize); 16185530Sbonwick 16191775Sbillm /* 16205530Sbonwick * As much as we'd like this to be 'ready' instead of 'done', 16211775Sbillm * updating our ASIZE doesn't happen until the io_done callback, 16221775Sbillm * so we have to wait for that to finish in order for our BP 16231775Sbillm * to be stable. 16241775Sbillm */ 16255530Sbonwick return (zio_wait_for_children_done(zio)); 1626789Sahrens } 1627789Sahrens 1628789Sahrens /* 1629789Sahrens * ========================================================================== 1630789Sahrens * Allocate and free blocks 1631789Sahrens * ========================================================================== 1632789Sahrens */ 16335530Sbonwick static int 1634789Sahrens zio_dva_allocate(zio_t *zio) 1635789Sahrens { 16364527Sperrin spa_t *spa = zio->io_spa; 16374527Sperrin metaslab_class_t *mc = spa->spa_normal_class; 1638789Sahrens blkptr_t *bp = zio->io_bp; 1639789Sahrens int error; 1640789Sahrens 1641789Sahrens ASSERT(BP_IS_HOLE(bp)); 16421775Sbillm ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 16431775Sbillm ASSERT3U(zio->io_ndvas, >, 0); 16444527Sperrin ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); 1645789Sahrens 16465329Sgw25295 /* 16475329Sgw25295 * For testing purposes, we force I/Os to retry. We don't allow 16485329Sgw25295 * retries beyond the first pass since those I/Os are non-allocating 16495403Sgw25295 * writes. 16505329Sgw25295 */ 16515329Sgw25295 if (zio_io_fail_shift && 16525329Sgw25295 spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && 16535329Sgw25295 zio_io_should_fail(zio_io_fail_shift)) 16545329Sgw25295 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 16555329Sgw25295 1656789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1657789Sahrens 16584527Sperrin error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, 16593063Sperrin zio->io_txg, NULL, B_FALSE); 1660789Sahrens 1661789Sahrens if (error == 0) { 1662789Sahrens bp->blk_birth = zio->io_txg; 16635329Sgw25295 } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { 16645530Sbonwick return (zio_write_allocate_gang_members(zio, mc)); 1665789Sahrens } else { 1666789Sahrens zio->io_error = error; 1667789Sahrens } 16685530Sbonwick 16695530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1670789Sahrens } 1671789Sahrens 16725530Sbonwick static int 1673789Sahrens zio_dva_free(zio_t *zio) 1674789Sahrens { 1675789Sahrens blkptr_t *bp = zio->io_bp; 1676789Sahrens 16771807Sbonwick metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1678789Sahrens 1679789Sahrens BP_ZERO(bp); 1680789Sahrens 16815530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1682789Sahrens } 1683789Sahrens 16845530Sbonwick static int 1685789Sahrens zio_dva_claim(zio_t *zio) 1686789Sahrens { 16871807Sbonwick zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1688789Sahrens 16895530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1690789Sahrens } 1691789Sahrens 1692789Sahrens /* 1693789Sahrens * ========================================================================== 1694789Sahrens * Read and write to physical devices 1695789Sahrens * ========================================================================== 1696789Sahrens */ 1697789Sahrens 16985530Sbonwick static int 16991775Sbillm zio_vdev_io_start(zio_t *zio) 1700789Sahrens { 1701789Sahrens vdev_t *vd = zio->io_vd; 17021775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 17031775Sbillm blkptr_t *bp = zio->io_bp; 17041775Sbillm uint64_t align; 17055329Sgw25295 spa_t *spa = zio->io_spa; 17065329Sgw25295 17075329Sgw25295 /* 17085329Sgw25295 * If the pool is already in a failure state then just suspend 17095329Sgw25295 * this IO until the problem is resolved. We will reissue them 17105329Sgw25295 * at that time. 17115329Sgw25295 */ 17125329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE && 17135530Sbonwick zio->io_type == ZIO_TYPE_WRITE) 17145530Sbonwick return (zio_vdev_suspend_io(zio)); 1715789Sahrens 17165530Sbonwick /* 17175530Sbonwick * The mirror_ops handle multiple DVAs in a single BP 17185530Sbonwick */ 17195530Sbonwick if (vd == NULL) 17205530Sbonwick return (vdev_mirror_ops.vdev_op_io_start(zio)); 17211775Sbillm 17221775Sbillm align = 1ULL << tvd->vdev_ashift; 17231775Sbillm 17241732Sbonwick if (zio->io_retries == 0 && vd == tvd) 1725789Sahrens zio->io_flags |= ZIO_FLAG_FAILFAST; 1726789Sahrens 17275530Sbonwick if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { 1728789Sahrens zio->io_flags |= ZIO_FLAG_PHYSICAL; 1729789Sahrens zio->io_offset += VDEV_LABEL_START_SIZE; 1730789Sahrens } 1731789Sahrens 17321732Sbonwick if (P2PHASE(zio->io_size, align) != 0) { 17331732Sbonwick uint64_t asize = P2ROUNDUP(zio->io_size, align); 17341732Sbonwick char *abuf = zio_buf_alloc(asize); 17351732Sbonwick ASSERT(vd == tvd); 17361732Sbonwick if (zio->io_type == ZIO_TYPE_WRITE) { 17371732Sbonwick bcopy(zio->io_data, abuf, zio->io_size); 17381732Sbonwick bzero(abuf + zio->io_size, asize - zio->io_size); 17391732Sbonwick } 17401732Sbonwick zio_push_transform(zio, abuf, asize, asize); 17411732Sbonwick ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 17421732Sbonwick zio->io_flags |= ZIO_FLAG_SUBBLOCK; 17431732Sbonwick } 17441732Sbonwick 17451732Sbonwick ASSERT(P2PHASE(zio->io_offset, align) == 0); 17461732Sbonwick ASSERT(P2PHASE(zio->io_size, align) == 0); 17471732Sbonwick ASSERT(bp == NULL || 17481732Sbonwick P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1749789Sahrens ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1750789Sahrens 17515530Sbonwick return (vd->vdev_ops->vdev_op_io_start(zio)); 1752789Sahrens } 1753789Sahrens 17545530Sbonwick static int 1755789Sahrens zio_vdev_io_done(zio_t *zio) 1756789Sahrens { 17571775Sbillm if (zio->io_vd == NULL) 17585530Sbonwick return (vdev_mirror_ops.vdev_op_io_done(zio)); 17595530Sbonwick 17605530Sbonwick return (zio->io_vd->vdev_ops->vdev_op_io_done(zio)); 1761789Sahrens } 1762789Sahrens 1763789Sahrens /* XXPOLICY */ 17641544Seschrock boolean_t 1765789Sahrens zio_should_retry(zio_t *zio) 1766789Sahrens { 1767789Sahrens vdev_t *vd = zio->io_vd; 1768789Sahrens 1769789Sahrens if (zio->io_error == 0) 1770789Sahrens return (B_FALSE); 1771789Sahrens if (zio->io_delegate_list != NULL) 1772789Sahrens return (B_FALSE); 17731775Sbillm if (vd && vd != vd->vdev_top) 1774789Sahrens return (B_FALSE); 1775789Sahrens if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1776789Sahrens return (B_FALSE); 17771544Seschrock if (zio->io_retries > 0) 1778789Sahrens return (B_FALSE); 1779789Sahrens 1780789Sahrens return (B_TRUE); 1781789Sahrens } 1782789Sahrens 17835530Sbonwick static int 1784789Sahrens zio_vdev_io_assess(zio_t *zio) 1785789Sahrens { 1786789Sahrens vdev_t *vd = zio->io_vd; 17871775Sbillm vdev_t *tvd = vd ? vd->vdev_top : NULL; 1788789Sahrens 17891544Seschrock ASSERT(zio->io_vsd == NULL); 1790789Sahrens 17911732Sbonwick if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 17921732Sbonwick void *abuf; 17931732Sbonwick uint64_t asize; 17941732Sbonwick ASSERT(vd == tvd); 17951732Sbonwick zio_pop_transform(zio, &abuf, &asize, &asize); 17961732Sbonwick if (zio->io_type == ZIO_TYPE_READ) 17971732Sbonwick bcopy(abuf, zio->io_data, zio->io_size); 17981732Sbonwick zio_buf_free(abuf, asize); 17991732Sbonwick zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 18001732Sbonwick } 18011732Sbonwick 18021544Seschrock if (zio_injection_enabled && !zio->io_error) 18031544Seschrock zio->io_error = zio_handle_fault_injection(zio, EIO); 1804789Sahrens 1805789Sahrens /* 1806789Sahrens * If the I/O failed, determine whether we should attempt to retry it. 1807789Sahrens */ 1808789Sahrens /* XXPOLICY */ 1809789Sahrens if (zio_should_retry(zio)) { 1810789Sahrens ASSERT(tvd == vd); 1811789Sahrens 1812789Sahrens zio->io_retries++; 1813789Sahrens zio->io_error = 0; 18145688Sbonwick zio->io_flags &= ZIO_FLAG_RETRY_INHERIT; 1815789Sahrens /* XXPOLICY */ 1816789Sahrens zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1817789Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE; 18181775Sbillm zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1819789Sahrens 18205530Sbonwick return (ZIO_PIPELINE_CONTINUE); 18211544Seschrock } 1822789Sahrens 18235530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1824789Sahrens } 1825789Sahrens 1826789Sahrens void 1827789Sahrens zio_vdev_io_reissue(zio_t *zio) 1828789Sahrens { 1829789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1830789Sahrens ASSERT(zio->io_error == 0); 1831789Sahrens 1832789Sahrens zio->io_stage--; 1833789Sahrens } 1834789Sahrens 1835789Sahrens void 1836789Sahrens zio_vdev_io_redone(zio_t *zio) 1837789Sahrens { 1838789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1839789Sahrens 1840789Sahrens zio->io_stage--; 1841789Sahrens } 1842789Sahrens 1843789Sahrens void 1844789Sahrens zio_vdev_io_bypass(zio_t *zio) 1845789Sahrens { 1846789Sahrens ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1847789Sahrens ASSERT(zio->io_error == 0); 1848789Sahrens 1849789Sahrens zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1850789Sahrens zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1851789Sahrens } 1852789Sahrens 1853789Sahrens /* 1854789Sahrens * ========================================================================== 1855789Sahrens * Generate and verify checksums 1856789Sahrens * ========================================================================== 1857789Sahrens */ 18585530Sbonwick static int 1859789Sahrens zio_checksum_generate(zio_t *zio) 1860789Sahrens { 1861789Sahrens int checksum = zio->io_checksum; 1862789Sahrens blkptr_t *bp = zio->io_bp; 1863789Sahrens 1864789Sahrens ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1865789Sahrens 1866789Sahrens BP_SET_CHECKSUM(bp, checksum); 1867789Sahrens BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1868789Sahrens 1869789Sahrens zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1870789Sahrens 18715530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1872789Sahrens } 1873789Sahrens 18745530Sbonwick static int 1875789Sahrens zio_gang_checksum_generate(zio_t *zio) 1876789Sahrens { 1877789Sahrens zio_cksum_t zc; 1878789Sahrens zio_gbh_phys_t *gbh = zio->io_data; 1879789Sahrens 18801775Sbillm ASSERT(BP_IS_GANG(zio->io_bp)); 1881789Sahrens ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1882789Sahrens 1883789Sahrens zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1884789Sahrens 1885789Sahrens zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1886789Sahrens 18875530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1888789Sahrens } 1889789Sahrens 18905530Sbonwick static int 1891789Sahrens zio_checksum_verify(zio_t *zio) 1892789Sahrens { 1893789Sahrens if (zio->io_bp != NULL) { 1894789Sahrens zio->io_error = zio_checksum_error(zio); 18951544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 18961544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 18971544Seschrock zio->io_spa, zio->io_vd, zio, 0, 0); 1898789Sahrens } 1899789Sahrens 19005530Sbonwick return (ZIO_PIPELINE_CONTINUE); 1901789Sahrens } 1902789Sahrens 1903789Sahrens /* 1904789Sahrens * Called by RAID-Z to ensure we don't compute the checksum twice. 1905789Sahrens */ 1906789Sahrens void 1907789Sahrens zio_checksum_verified(zio_t *zio) 1908789Sahrens { 1909789Sahrens zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1910789Sahrens } 1911789Sahrens 1912789Sahrens /* 1913789Sahrens * Set the external verifier for a gang block based on stuff in the bp 1914789Sahrens */ 1915789Sahrens void 1916789Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1917789Sahrens { 19181775Sbillm blkptr_t *bp = zio->io_bp; 19191775Sbillm 19201775Sbillm zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 19211775Sbillm zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 19221775Sbillm zcp->zc_word[2] = bp->blk_birth; 1923789Sahrens zcp->zc_word[3] = 0; 1924789Sahrens } 1925789Sahrens 1926789Sahrens /* 1927789Sahrens * ========================================================================== 1928789Sahrens * Define the pipeline 1929789Sahrens * ========================================================================== 1930789Sahrens */ 19315530Sbonwick typedef int zio_pipe_stage_t(zio_t *zio); 1932789Sahrens 1933789Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 19345530Sbonwick NULL, 19355530Sbonwick zio_wait_for_children_ready, 19365530Sbonwick zio_read_init, 19375530Sbonwick zio_issue_async, 1938789Sahrens zio_write_compress, 1939789Sahrens zio_checksum_generate, 1940789Sahrens zio_get_gang_header, 1941789Sahrens zio_rewrite_gang_members, 1942789Sahrens zio_free_gang_members, 1943789Sahrens zio_claim_gang_members, 1944789Sahrens zio_dva_allocate, 1945789Sahrens zio_dva_free, 1946789Sahrens zio_dva_claim, 1947789Sahrens zio_gang_checksum_generate, 1948789Sahrens zio_ready, 1949789Sahrens zio_vdev_io_start, 1950789Sahrens zio_vdev_io_done, 1951789Sahrens zio_vdev_io_assess, 19525530Sbonwick zio_wait_for_children_done, 1953789Sahrens zio_checksum_verify, 1954789Sahrens zio_read_gang_members, 1955789Sahrens zio_read_decompress, 19565329Sgw25295 zio_assess, 1957789Sahrens zio_done, 19585530Sbonwick NULL 1959789Sahrens }; 1960789Sahrens 1961789Sahrens /* 19625530Sbonwick * Execute the I/O pipeline until one of the following occurs: 19635530Sbonwick * (1) the I/O completes; (2) the pipeline stalls waiting for 19645530Sbonwick * dependent child I/Os; (3) the I/O issues, so we're waiting 19655530Sbonwick * for an I/O completion interrupt; (4) the I/O is delegated by 19665530Sbonwick * vdev-level caching or aggregation; (5) the I/O is deferred 19675530Sbonwick * due to vdev-level queueing; (6) the I/O is handed off to 19685530Sbonwick * another thread. In all cases, the pipeline stops whenever 19695530Sbonwick * there's no CPU work; it never burns a thread in cv_wait(). 19705530Sbonwick * 19715530Sbonwick * There's no locking on io_stage because there's no legitimate way 19725530Sbonwick * for multiple threads to be attempting to process the same I/O. 1973789Sahrens */ 1974789Sahrens void 19755530Sbonwick zio_execute(zio_t *zio) 1976789Sahrens { 19775530Sbonwick while (zio->io_stage < ZIO_STAGE_DONE) { 19785530Sbonwick uint32_t pipeline = zio->io_pipeline; 19795530Sbonwick int rv; 1980789Sahrens 19815530Sbonwick ASSERT(!MUTEX_HELD(&zio->io_lock)); 1982789Sahrens 19835530Sbonwick /* 19845530Sbonwick * If an error occurred outside the vdev stack, 19855530Sbonwick * just execute the interlock stages to clean up. 19865530Sbonwick */ 19875530Sbonwick if (zio->io_error && 19885530Sbonwick ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0) 1989789Sahrens pipeline &= ZIO_ERROR_PIPELINE_MASK; 1990789Sahrens 19915530Sbonwick while (((1U << ++zio->io_stage) & pipeline) == 0) 19925530Sbonwick continue; 1993789Sahrens 19945530Sbonwick ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 19955530Sbonwick ASSERT(zio->io_stalled == 0); 19965530Sbonwick 19975530Sbonwick rv = zio_pipeline[zio->io_stage](zio); 19985530Sbonwick 19995530Sbonwick if (rv == ZIO_PIPELINE_STOP) 20005530Sbonwick return; 20015530Sbonwick 20025530Sbonwick ASSERT(rv == ZIO_PIPELINE_CONTINUE); 2003789Sahrens } 2004789Sahrens } 2005789Sahrens 20063668Sgw25295 static boolean_t 20075329Sgw25295 zio_io_should_fail(uint16_t range) 20083668Sgw25295 { 20093668Sgw25295 static uint16_t allocs = 0; 20103668Sgw25295 20115329Sgw25295 return (P2PHASE(allocs++, 1U<<range) == 0); 20123668Sgw25295 } 20133668Sgw25295 2014789Sahrens /* 2015789Sahrens * Try to allocate an intent log block. Return 0 on success, errno on failure. 2016789Sahrens */ 2017789Sahrens int 20183063Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 20193063Sperrin uint64_t txg) 2020789Sahrens { 2021789Sahrens int error; 2022789Sahrens 20231544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2024789Sahrens 20255329Sgw25295 if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) { 20263668Sgw25295 spa_config_exit(spa, FTAG); 20273668Sgw25295 return (ENOSPC); 20283668Sgw25295 } 20293668Sgw25295 20303063Sperrin /* 20314527Sperrin * We were passed the previous log block's DVA in bp->blk_dva[0]. 20324527Sperrin * We use that as a hint for which vdev to allocate from next. 20333063Sperrin */ 20344527Sperrin error = metaslab_alloc(spa, spa->spa_log_class, size, 20354527Sperrin new_bp, 1, txg, old_bp, B_TRUE); 20364527Sperrin 20374527Sperrin if (error) 20384527Sperrin error = metaslab_alloc(spa, spa->spa_normal_class, size, 20394527Sperrin new_bp, 1, txg, old_bp, B_TRUE); 2040789Sahrens 2041789Sahrens if (error == 0) { 20423063Sperrin BP_SET_LSIZE(new_bp, size); 20433063Sperrin BP_SET_PSIZE(new_bp, size); 20443063Sperrin BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 20453063Sperrin BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 20463063Sperrin BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 20473063Sperrin BP_SET_LEVEL(new_bp, 0); 20483063Sperrin BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 20493063Sperrin new_bp->blk_birth = txg; 2050789Sahrens } 2051789Sahrens 20521544Seschrock spa_config_exit(spa, FTAG); 2053789Sahrens 2054789Sahrens return (error); 2055789Sahrens } 2056789Sahrens 2057789Sahrens /* 2058789Sahrens * Free an intent log block. We know it can't be a gang block, so there's 2059789Sahrens * nothing to do except metaslab_free() it. 2060789Sahrens */ 2061789Sahrens void 2062789Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 2063789Sahrens { 20641775Sbillm ASSERT(!BP_IS_GANG(bp)); 2065789Sahrens 20661544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2067789Sahrens 20681807Sbonwick metaslab_free(spa, bp, txg, B_FALSE); 2069789Sahrens 20701544Seschrock spa_config_exit(spa, FTAG); 2071789Sahrens } 20724469Sperrin 20734469Sperrin /* 20744469Sperrin * start an async flush of the write cache for this vdev 20754469Sperrin */ 20764469Sperrin void 20775688Sbonwick zio_flush(zio_t *zio, vdev_t *vd) 20784469Sperrin { 20795688Sbonwick zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 20804469Sperrin NULL, NULL, ZIO_PRIORITY_NOW, 20814469Sperrin ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); 20824469Sperrin } 2083