/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>

/*
 * Vdev mirror kstats
 */
static kstat_t *mirror_ksp = NULL;

typedef struct mirror_stats {
	kstat_named_t vdev_mirror_stat_rotating_linear;
	kstat_named_t vdev_mirror_stat_rotating_offset;
	kstat_named_t vdev_mirror_stat_rotating_seek;
	kstat_named_t vdev_mirror_stat_non_rotating_linear;
	kstat_named_t vdev_mirror_stat_non_rotating_seek;

	kstat_named_t vdev_mirror_stat_preferred_found;
	kstat_named_t vdev_mirror_stat_preferred_not_found;
} mirror_stats_t;

static mirror_stats_t mirror_stats = {
	/* New I/O directly follows the last I/O */
	{ "rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
	{ "rotating_offset", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek */
	{ "rotating_seek", KSTAT_DATA_UINT64 },
	/* New I/O directly follows the last I/O (nonrot) */
	{ "non_rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek (nonrot) */
	{ "non_rotating_seek", KSTAT_DATA_UINT64 },
	/* Preferred child vdev found */
	{ "preferred_found", KSTAT_DATA_UINT64 },
	/* Preferred child vdev not found or equal load */
	{ "preferred_not_found", KSTAT_DATA_UINT64 },

};
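
/*
 * Note: these counters are exported read-only through the kstat
 * interface; assuming the standard OpenZFS layout on Linux they appear
 * under /proc/spl/kstat/zfs/vdev_mirror_stats.
 */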

#define	MIRROR_STAT(stat)		(mirror_stats.stat.value.ui64)
#define	MIRROR_INCR(stat, val)		atomic_add_64(&MIRROR_STAT(stat), val)
#define	MIRROR_BUMP(stat)		MIRROR_INCR(stat, 1)

void
vdev_mirror_stat_init(void)
{
	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (mirror_ksp != NULL) {
		mirror_ksp->ks_data = &mirror_stats;
		kstat_install(mirror_ksp);
	}
}

void
vdev_mirror_stat_fini(void)
{
	if (mirror_ksp != NULL) {
		kstat_delete(mirror_ksp);
		mirror_ksp = NULL;
	}
}

/*
 * Virtual device vector for mirroring.
 */
typedef struct mirror_child {
	vdev_t		*mc_vd;
	abd_t		*mc_abd;
	uint64_t	mc_offset;
	int		mc_error;
	int		mc_load;
	uint8_t		mc_tried;
	uint8_t		mc_skipped;
	uint8_t		mc_speculative;
	uint8_t		mc_rebuilding;
} mirror_child_t;

typedef struct mirror_map {
	int		*mm_preferred;
	int		mm_preferred_cnt;
	int		mm_children;
	boolean_t	mm_resilvering;
	boolean_t	mm_rebuilding;
	boolean_t	mm_root;
	mirror_child_t	mm_child[];
} mirror_map_t;

static const int vdev_mirror_shift = 21;

/*
 * The load configuration settings below are tuned by default for
 * the case where all devices are of the same rotational type.
 *
 * If there is a mixture of rotating and non-rotating media, setting
 * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better
 * results, as it will direct more reads to the non-rotating vdevs,
 * which are likely to offer higher performance.
 */
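
/*
 * For example, on Linux that tunable can be cleared at runtime via the
 * module parameter interface (path assumed from the standard OpenZFS
 * layout):
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_vdev_mirror_non_rotating_seek_inc
 */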

/* Rotating media load calculation configuration. */
static int zfs_vdev_mirror_rotating_inc = 0;
static int zfs_vdev_mirror_rotating_seek_inc = 5;
static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;

/* Non-rotating media load calculation configuration. */
static int zfs_vdev_mirror_non_rotating_inc = 0;
static int zfs_vdev_mirror_non_rotating_seek_inc = 1;

/*
 * The mm_preferred index array lives in the same allocation as the
 * mirror_map_t itself, immediately after the flexible mm_child[] array,
 * which is why an extra sizeof (int) per child is added here.
 */
static inline size_t
vdev_mirror_map_size(int children)
{
	return (offsetof(mirror_map_t, mm_child[children]) +
	    sizeof (int) * children);
}

static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
{
	mirror_map_t *mm;

	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
	mm->mm_children = children;
	mm->mm_resilvering = resilvering;
	mm->mm_root = root;
	mm->mm_preferred = (int *)((uintptr_t)mm +
	    offsetof(mirror_map_t, mm_child[children]));

	return (mm);
}

static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}

static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	.vsd_free = vdev_mirror_map_free,
};
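
/*
 * Illustrative load example for vdev_mirror_load() below, using the
 * default tunables: a rotating child with four queued I/Os whose new
 * offset lands within 1 MiB of its last one scores 4 + 5 / 2 = 6
 * (integer division), while an idle rotating child facing a full seek
 * scores 0 + 5 = 5 and is therefore preferred.
 */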

static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
	uint64_t last_offset;
	int64_t offset_diff;
	int load;

	/* All DVAs have equal weight at the root. */
	if (mm->mm_root)
		return (INT_MAX);

	/*
	 * We don't return INT_MAX if the device is resilvering (i.e.
	 * vdev_resilver_txg != 0) since, when tested, overall performance
	 * was slightly worse with that behavior than without it.
	 */

	/* Fix zio_offset for leaf vdevs */
	if (vd->vdev_ops->vdev_op_leaf)
		zio_offset += VDEV_LABEL_START_SIZE;

	/* Standard load based on pending queue length. */
	load = vdev_queue_length(vd);
	last_offset = vdev_queue_last_offset(vd);

	if (vd->vdev_nonrot) {
		/* Non-rotating media. */
		if (last_offset == zio_offset) {
			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
			return (load + zfs_vdev_mirror_non_rotating_inc);
		}

		/*
		 * Apply a seek penalty even for non-rotating devices as
		 * sequential I/O's can be aggregated into fewer operations on
		 * the device, thus avoiding unnecessary per-command overhead
		 * and boosting performance.
		 */
		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
	}

	/* Rotating media I/O's which directly follow the last I/O. */
	if (last_offset == zio_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
		return (load + zfs_vdev_mirror_rotating_inc);
	}

	/*
	 * Apply half the seek increment to I/O's within seek offset
	 * of the last I/O issued to this vdev as they should incur less
	 * of a seek increment.
	 */
	offset_diff = (int64_t)(last_offset - zio_offset);
	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
	}

	/* Apply the full seek increment to all other I/O's. */
	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
	return (load + zfs_vdev_mirror_rotating_seek_inc);
}

static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
		return (B_TRUE);

	for (int i = 0; i < vd->vdev_children; i++) {
		if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
		dva_t dva_copy[SPA_DVAS_PER_BP];

		/*
		 * The sequential scrub code sorts and issues all DVAs
		 * of a bp separately. Each of these IOs includes all
		 * original DVA copies so that repairs can be performed
		 * in the event of an error, but we only actually want
		 * to check the first DVA since the others will be
		 * checked by their respective sorted IOs. Only if we
		 * hit an error will we try all DVAs upon retrying.
		 *
		 * Note: This check is safe even if the user switches
		 * from a legacy scrub to a sequential one in the middle
		 * of processing, since scn_is_sorted isn't updated until
		 * all outstanding IOs from the previous scrub pass
		 * complete.
		 */
		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
		    scn->scn_is_sorted) {
			c = 1;
		} else {
			c = BP_GET_NDVAS(zio->io_bp);
		}

		/*
		 * If the pool cannot be written to, then infer that some
		 * DVAs might be invalid or point to vdevs that do not exist.
		 * We skip them.
		 */
		if (!spa_writeable(spa)) {
			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
			int j = 0;
			for (int i = 0; i < c; i++) {
				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
					dva_copy[j++] = dva[i];
			}
			if (j == 0) {
				zio->io_vsd = NULL;
				zio->io_error = ENXIO;
				return (NULL);
			}
			if (j < c) {
				dva = dva_copy;
				c = j;
			}
		}

		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
			if (mc->mc_vd == NULL) {
				kmem_free(mm, vdev_mirror_map_size(
				    mm->mm_children));
				zio->io_vsd = NULL;
				zio->io_error = ENXIO;
				return (NULL);
			}
		}
	} else {
		/*
		 * If we are resilvering, then we should handle scrub reads
		 * differently; we shouldn't issue them to the resilvering
		 * device because it might not have those blocks.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (i.e. our name is "replacing-1"
		 *    or "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * We cannot simply check vd->vdev_resilver_txg, because it's
		 * not set in this path.
		 *
		 * Nor can we just check our vdev_ops; there are cases (such as
		 * when a user types "zpool replace pool odev spare_dev" and
		 * spare_dev is in the spare list, or when a spare device is
		 * automatically used to replace a DEGRADED device) when
		 * resilvering is complete but both the original vdev and the
		 * spare vdev remain in the pool. That behavior is intentional.
		 * It helps implement the policy that a spare should be
		 * automatically removed from the pool after the user replaces
		 * the device that originally failed.
		 *
		 * If a spa load is in progress, then spa_dsl_pool may be
		 * uninitialized. But we shouldn't be resilvering during a spa
		 * load anyway.
		 */
		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) &&
		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
		    B_FALSE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			if (vdev_mirror_rebuilding(mc->mc_vd))
				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
		}
	}

	return (mm);
}

static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	int numerrors = 0;
	int lasterror = 0;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
	}
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error)
			continue;
		*physical_ashift = vdev_best_ashift(*logical_ashift,
		    *physical_ashift, cvd->vdev_physical_ashift);
	}

	if (numerrors == vd->vdev_children) {
		if (vdev_children_are_offline(vd))
			vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
		else
			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_mirror_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
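
/*
 * Sizing note for vdev_mirror_open() above: MIN(a - 1, b - 1) + 1 is
 * simply MIN(a, b) for nonzero values, but when *asize enters as 0
 * (presumably its initial value from the caller) the unsigned wrap of
 * 0 - 1 lets the first healthy child establish the mirror's size; each
 * subsequent child can then only shrink it to the smallest member.
 */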

static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

/*
 * Check the other, lower-index DVAs to see if they're on the same
 * vdev as the child we picked.  If they are, use them since they
 * are likely to have been allocated from the primary metaslab in
 * use at the time, and hence are more likely to have locality with
 * single-copy data.
 */
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
	dva_t *dva = zio->io_bp->blk_dva;
	mirror_map_t *mm = zio->io_vsd;
	int preferred;
	int c;

	preferred = mm->mm_preferred[p];
	for (p--; p >= 0; p--) {
		c = mm->mm_preferred[p];
		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
			preferred = c;
	}
	return (preferred);
}

static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	int p;

	if (mm->mm_root) {
		p = random_in_range(mm->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, p));
	}

	/*
	 * To ensure we don't always favour the first matching vdev,
	 * which could lead to wear leveling issues on SSDs, we
	 * use the I/O offset as a pseudo random seed into the vdevs
	 * which have the lowest load.
	 */
	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
	return (mm->mm_preferred[p]);
}

static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_readable(vd, mc->mc_offset));
	else
		return (vdev_readable(vd));
}

static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
	else
		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
}

/*
 * Try to find a vdev whose DTL doesn't contain the block we want to read,
 * preferring vdevs based on their determined load.  If we can't, try the
 * read on any vdev we haven't already tried.
 *
 * Distributed spares are an exception to the above load rule.  They are
 * always preferred in order to detect gaps in the distributed spare which
 * are created when another disk in the dRAID fails.  In order to restore
 * redundancy those gaps must be read to trigger the required repair IO.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	uint64_t txg = zio->io_txg;
	int c, lowest_load;

	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);

	lowest_load = INT_MAX;
	mm->mm_preferred_cnt = 0;
	for (c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc;

		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;

		if (mc->mc_vd == NULL ||
		    !vdev_mirror_child_readable(mc)) {
			mc->mc_error = SET_ERROR(ENXIO);
			mc->mc_tried = 1;	/* don't even try */
			mc->mc_skipped = 1;
			continue;
		}

		if (vdev_mirror_child_missing(mc, txg, 1)) {
			mc->mc_error = SET_ERROR(ESTALE);
			mc->mc_skipped = 1;
			mc->mc_speculative = 1;
			continue;
		}

		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
			mm->mm_preferred[0] = c;
			mm->mm_preferred_cnt = 1;
			break;
		}

		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
		if (mc->mc_load > lowest_load)
			continue;

		if (mc->mc_load < lowest_load) {
			lowest_load = mc->mc_load;
			mm->mm_preferred_cnt = 0;
		}
		mm->mm_preferred[mm->mm_preferred_cnt] = c;
		mm->mm_preferred_cnt++;
	}

	if (mm->mm_preferred_cnt == 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
		return (mm->mm_preferred[0]);
	}

	if (mm->mm_preferred_cnt > 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
		return (vdev_mirror_preferred_child_randomize(zio));
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++) {
		if (!mm->mm_child[c].mc_tried)
			return (c);
	}

	/*
	 * Every child failed.  There's no place left to look.
	 */
	return (-1);
}

static void
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_init(zio);
	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;

	if (mm == NULL) {
		ASSERT(!spa_trust_config(zio->io_spa));
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_READ) {
		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
			/*
			 * For scrubbing reads we need to issue reads to all
			 * children.  One child can reuse the parent buffer,
			 * but for the others we have to allocate separate
			 * ones to verify checksums if io_bp is non-NULL, or
			 * compare them in vdev_mirror_io_done() otherwise.
			 */
			boolean_t first = B_TRUE;
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];

				/* Don't issue ZIOs to offline children */
				if (!vdev_mirror_child_readable(mc)) {
					mc->mc_error = SET_ERROR(ENXIO);
					mc->mc_tried = 1;
					mc->mc_skipped = 1;
					continue;
				}

				mc->mc_abd = first ? zio->io_abd :
				    abd_alloc_sametype(zio->io_abd,
				    zio->io_size);
				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset, mc->mc_abd,
				    zio->io_size, zio->io_type,
				    zio->io_priority, 0,
				    vdev_mirror_child_done, mc));
				first = B_FALSE;
			}
			zio_execute(zio);
			return;
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		c++;

		/*
		 * When sequentially resilvering only issue write repair
		 * IOs to the vdev which is being rebuilt since performance
		 * is limited by the slowest child.  This is an issue for
		 * faster replacement devices such as distributed spares.
		 */
		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
		    mm->mm_rebuilding && !mc->mc_rebuilding) {
			continue;
		}

		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
	}

	zio_execute(zio);
}
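
/*
 * Pick the most severe error across the children.  Errors from children
 * that were expected to have the data (error[0]) take precedence over
 * speculative errors (error[1]) from children whose DTL already said
 * the data might be missing.
 */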
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int error[2] = { 0, 0 };

	for (int c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ? error[0] : error[1]);
}

static void
vdev_mirror_io_done(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	int c;
	int good_copies = 0;
	int unexpected_errors = 0;
	int last_good_copy = -1;

	if (mm == NULL)
		return;

	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];

		if (mc->mc_error) {
			if (!mc->mc_skipped)
				unexpected_errors++;
		} else if (mc->mc_tried) {
			last_good_copy = c;
			good_copies++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		if (good_copies != mm->mm_children) {
			/*
			 * Always require at least one good copy.
			 *
			 * For ditto blocks (io_vd == NULL), require
			 * all copies to be good.
			 *
			 * XXX -- for replacing vdevs, there's no great answer.
			 * If the old device is really dead, we may not even
			 * be able to access it -- so we only want to
			 * require good writes to the new device.  But if
			 * the new device turns out to be flaky, we want
			 * to be able to detach it -- which requires all
			 * writes to the old device to have succeeded.
			 */
			if (good_copies == 0 || zio->io_vd == NULL)
				zio->io_error = vdev_mirror_worst_error(mm);
		}
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Any Direct I/O read that has a checksum error must be treated as
	 * suspicious as the contents of the buffer could be getting
	 * manipulated while the I/O is taking place. The checksum verify
	 * error will be reported to the top-level Mirror VDEV.
	 *
	 * No attempt will be made to read any additional data copies. If
	 * the buffer is still being manipulated while attempting to read
	 * from another child, there exists a possibility that the checksum
	 * could be verified as valid. However, the buffer contents could
	 * again get manipulated after verifying the checksum. This would
	 * lead to bad data being written out during self healing.
	 */
	if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
	    (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
		zio_dio_chksum_verify_error_report(zio);
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT3U(zio->io_error, ==, ECKSUM);
		return;
	}

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < mm->mm_children);
		mc = &mm->mm_child[c];
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		return;
	}

	if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
		abd_t *best_abd = NULL;
		if (last_good_copy >= 0)
			best_abd = mm->mm_child[last_good_copy].mc_abd;

		/*
		 * If we're scrubbing but don't have a BP available (because
		 * this vdev is under a raidz or draid vdev) then the best we
		 * can do is compare all of the copies read.  If they're not
		 * identical then return a checksum error and the most likely
		 * correct data.  The raidz code will issue a repair I/O if
		 * possible.
		 */
		if (zio->io_bp == NULL) {
			ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
			    zio->io_vd->vdev_ops == &vdev_spare_ops);

			abd_t *pref_abd = NULL;
			for (c = 0; c < last_good_copy; c++) {
				mc = &mm->mm_child[c];
				if (mc->mc_error || !mc->mc_tried)
					continue;

				if (abd_cmp(mc->mc_abd, best_abd) != 0)
					zio->io_error = SET_ERROR(ECKSUM);

				/*
				 * The distributed spare is always preferred
				 * by vdev_mirror_child_select() so it's
				 * considered to be the best candidate.
				 */
				if (pref_abd == NULL &&
				    mc->mc_vd->vdev_ops ==
				    &vdev_draid_spare_ops)
					pref_abd = mc->mc_abd;

				/*
				 * In the absence of a preferred copy, use
				 * the parent pointer to avoid a memory copy.
				 */
				if (mc->mc_abd == zio->io_abd)
					best_abd = mc->mc_abd;
			}
			if (pref_abd)
				best_abd = pref_abd;
		} else {

			/*
			 * If we have a BP available, then checksums are
			 * already verified and we just need a buffer
			 * with valid data, preferring the parent one to
			 * avoid a memory copy.
			 */
			for (c = 0; c < last_good_copy; c++) {
				mc = &mm->mm_child[c];
				if (mc->mc_error || !mc->mc_tried)
					continue;
				if (mc->mc_abd == zio->io_abd) {
					best_abd = mc->mc_abd;
					break;
				}
			}
		}

		if (best_abd && best_abd != zio->io_abd)
			abd_copy(zio->io_abd, best_abd, zio->io_size);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			if (mc->mc_abd != zio->io_abd)
				abd_free(mc->mc_abd);
			mc->mc_abd = NULL;
		}
	}

	if (good_copies == 0) {
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT(zio->io_error != 0);
	}

	if (good_copies && spa_writeable(zio->io_spa) &&
	    (unexpected_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			mc = &mm->mm_child[c];

			if (mc->mc_error == 0) {
				vdev_ops_t *ops = mc->mc_vd->vdev_ops;

				if (mc->mc_tried)
					continue;
				/*
				 * We didn't try this child.  We need to
				 * repair it if:
				 * 1. it's a scrub (in which case we have
				 *    tried everything that was healthy)
				 *	- or -
				 * 2. it's an indirect or distributed spare
				 *    vdev (in which case it could point to any
				 *    other vdev, which might have a bad DTL)
				 *	- or -
				 * 3. the DTL indicates that this data is
				 *    missing from this vdev
				 */
				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
				    ops != &vdev_indirect_ops &&
				    ops != &vdev_draid_spare_ops &&
				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
				    zio->io_txg, 1))
					continue;
				mc->mc_error = SET_ERROR(ESTALE);
			}

			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset,
			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children) {
		if (vdev_children_are_offline(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
			    VDEV_AUX_CHILDREN_OFFLINE);
		} else {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_NO_REPLICAS);
		}
	} else if (degraded + faulted != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	} else {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}
}
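
/*
 * Worked example for vdev_mirror_rebuild_asize() below: a max_segment
 * hint of 1 MiB + 512 bytes on an ashift=12 vdev is rounded up to
 * 1 MiB + 4 KiB (and would be capped at SPA_MAXBLOCKSIZE), then
 * clamped to the asize remaining in the range being rebuilt.
 */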

/*
 * Return the maximum asize for a rebuild zio in the provided range.
 */
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
    uint64_t max_segment)
{
	(void) start;

	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
	    SPA_MAXBLOCKSIZE);

	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
}

vdev_ops_t vdev_mirror_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_replacing_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_spare_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
	"Rotating media load increment for non-seeking I/Os");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
	ZMOD_RW, "Rotating media load increment for seeking I/Os");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
	ZMOD_RW,
	"Offset in bytes from the last I/O which triggers "
	"a reduced rotating media seek increment");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
	ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
	ZMOD_RW, "Non-rotating media load increment for seeking I/Os");