/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>

/*
 * Vdev mirror kstats
 */
static kstat_t *mirror_ksp = NULL;

typedef struct mirror_stats {
	kstat_named_t vdev_mirror_stat_rotating_linear;
	kstat_named_t vdev_mirror_stat_rotating_offset;
	kstat_named_t vdev_mirror_stat_rotating_seek;
	kstat_named_t vdev_mirror_stat_non_rotating_linear;
	kstat_named_t vdev_mirror_stat_non_rotating_seek;

	kstat_named_t vdev_mirror_stat_preferred_found;
	kstat_named_t vdev_mirror_stat_preferred_not_found;
} mirror_stats_t;

static mirror_stats_t mirror_stats = {
	/* New I/O follows directly the last I/O */
	{ "rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
	{ "rotating_offset", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek */
	{ "rotating_seek", KSTAT_DATA_UINT64 },
	/* New I/O follows directly the last I/O (nonrot) */
	{ "non_rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek (nonrot) */
	{ "non_rotating_seek", KSTAT_DATA_UINT64 },
	/* Preferred child vdev found */
	{ "preferred_found", KSTAT_DATA_UINT64 },
	/* Preferred child vdev not found or equal load */
	{ "preferred_not_found", KSTAT_DATA_UINT64 },
};
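
/*
 * These counters record how each mirror read was classified by the
 * load-balancing code below.  They are observable at runtime via the
 * kstat interface; the exact paths are platform-specific and shown
 * here for illustration only:
 *
 *	Linux:   cat /proc/spl/kstat/zfs/vdev_mirror_stats
 *	FreeBSD: sysctl kstat.zfs.misc.vdev_mirror_stats
 */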

#define	MIRROR_STAT(stat)	(mirror_stats.stat.value.ui64)
#define	MIRROR_INCR(stat, val)	atomic_add_64(&MIRROR_STAT(stat), val)
#define	MIRROR_BUMP(stat)	MIRROR_INCR(stat, 1)

void
vdev_mirror_stat_init(void)
{
	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (mirror_ksp != NULL) {
		mirror_ksp->ks_data = &mirror_stats;
		kstat_install(mirror_ksp);
	}
}

void
vdev_mirror_stat_fini(void)
{
	if (mirror_ksp != NULL) {
		kstat_delete(mirror_ksp);
		mirror_ksp = NULL;
	}
}

/*
 * Virtual device vector for mirroring.
 */
typedef struct mirror_child {
	vdev_t		*mc_vd;
	uint64_t	mc_offset;
	int		mc_error;
	int		mc_load;
	uint8_t		mc_tried;
	uint8_t		mc_skipped;
	uint8_t		mc_speculative;
	uint8_t		mc_rebuilding;
} mirror_child_t;

typedef struct mirror_map {
	int		*mm_preferred;
	int		mm_preferred_cnt;
	int		mm_children;
	boolean_t	mm_resilvering;
	boolean_t	mm_rebuilding;
	boolean_t	mm_root;
	mirror_child_t	mm_child[];
} mirror_map_t;

static const int vdev_mirror_shift = 21;

/*
 * The load configuration settings below are tuned by default for
 * the case where all devices are of the same rotational type.
 *
 * If there is a mixture of rotating and non-rotating media, setting
 * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
 * as it will direct more reads to the non-rotating vdevs, which are more
 * likely to have higher performance.
 */

/* Rotating media load calculation configuration. */
static int zfs_vdev_mirror_rotating_inc = 0;
static int zfs_vdev_mirror_rotating_seek_inc = 5;
static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;

/* Non-rotating media load calculation configuration. */
static int zfs_vdev_mirror_non_rotating_inc = 0;
static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
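
/*
 * A sketch of how the default increments interact (illustrative, not a
 * benchmark): on an all-SSD mirror every non-linear I/O pays at most a
 * +1 penalty, so queue depth dominates child selection.  On a mixed
 * HDD/SSD mirror the SSD usually wins a seek anyway (+1 vs. +5), and
 * setting zfs_vdev_mirror_non_rotating_seek_inc to 0 removes even that
 * penalty, biasing reads further toward the SSD as described above.
 */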

static inline size_t
vdev_mirror_map_size(int children)
{
	return (offsetof(mirror_map_t, mm_child[children]) +
	    sizeof (int) * children);
}

static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
{
	mirror_map_t *mm;

	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
	mm->mm_children = children;
	mm->mm_resilvering = resilvering;
	mm->mm_root = root;
	mm->mm_preferred = (int *)((uintptr_t)mm +
	    offsetof(mirror_map_t, mm_child[children]));

	return (mm);
}

static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}

static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	.vsd_free = vdev_mirror_map_free,
};

static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
	uint64_t last_offset;
	int64_t offset_diff;
	int load;

	/* All DVAs have equal weight at the root. */
	if (mm->mm_root)
		return (INT_MAX);

	/*
	 * We don't return INT_MAX if the device is resilvering (i.e.
	 * vdev_resilver_txg != 0) because in testing overall performance
	 * was slightly worse with that penalty applied during a resilver
	 * than without it.
	 */

	/* Fix zio_offset for leaf vdevs */
	if (vd->vdev_ops->vdev_op_leaf)
		zio_offset += VDEV_LABEL_START_SIZE;

	/* Standard load based on pending queue length. */
	load = vdev_queue_length(vd);
	last_offset = vdev_queue_last_offset(vd);

	if (vd->vdev_nonrot) {
		/* Non-rotating media. */
		if (last_offset == zio_offset) {
			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
			return (load + zfs_vdev_mirror_non_rotating_inc);
		}

		/*
		 * Apply a seek penalty even for non-rotating devices as
		 * sequential I/O's can be aggregated into fewer operations on
		 * the device, thus avoiding unnecessary per-command overhead
		 * and boosting performance.
		 */
		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
	}

	/* Rotating media I/O's which directly follow the last I/O. */
	if (last_offset == zio_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
		return (load + zfs_vdev_mirror_rotating_inc);
	}

	/*
	 * Apply half the seek increment to I/O's within seek offset
	 * of the last I/O issued to this vdev as they should incur less
	 * of a seek increment.
	 */
	offset_diff = (int64_t)(last_offset - zio_offset);
	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
	}

	/* Apply the full seek increment to all other I/O's. */
	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
	return (load + zfs_vdev_mirror_rotating_seek_inc);
}
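
/*
 * A worked example of the load calculation above with the default
 * tunables (numbers illustrative): on a two-way HDD mirror, child A has
 * 3 pending I/Os and needs a long seek, giving 3 + 5 = 8; child B has 4
 * pending I/Os but the new offset is within 1 MiB of its last one,
 * giving 4 + 5 / 2 = 6 (integer division).  Child B is preferred
 * despite its deeper queue.
 */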

static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
		return (B_TRUE);

	for (int i = 0; i < vd->vdev_children; i++) {
		if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}
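
/*
 * Illustrative walk of the recursion above (descriptive only, no new
 * behavior): for a child tree such as replacing-0 -> { old-disk,
 * new-disk } where new-disk is a leaf undergoing a sequential rebuild
 * (vdev_rebuild_txg != 0), the walk returns B_TRUE at every level,
 * which is how the mirror map below learns that one of its children
 * is still rebuilding.
 */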

/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
		dva_t dva_copy[SPA_DVAS_PER_BP];

		/*
		 * The sequential scrub code sorts and issues all DVAs
		 * of a bp separately. Each of these IOs includes all
		 * original DVA copies so that repairs can be performed
		 * in the event of an error, but we only actually want
		 * to check the first DVA since the others will be
		 * checked by their respective sorted IOs. Only if we
		 * hit an error will we try all DVAs upon retrying.
		 *
		 * Note: This check is safe even if the user switches
		 * from a legacy scrub to a sequential one in the middle
		 * of processing, since scn_is_sorted isn't updated until
		 * all outstanding IOs from the previous scrub pass
		 * complete.
		 */
		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
		    scn->scn_is_sorted) {
			c = 1;
		} else {
			c = BP_GET_NDVAS(zio->io_bp);
		}

		/*
		 * If the pool cannot be written to, then infer that some
		 * DVAs might be invalid or point to vdevs that do not exist.
		 * We skip them.
		 */
		if (!spa_writeable(spa)) {
			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
			int j = 0;
			for (int i = 0; i < c; i++) {
				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
					dva_copy[j++] = dva[i];
			}
			if (j == 0) {
				zio->io_vsd = NULL;
				zio->io_error = ENXIO;
				return (NULL);
			}
			if (j < c) {
				dva = dva_copy;
				c = j;
			}
		}

		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
			if (mc->mc_vd == NULL) {
				kmem_free(mm, vdev_mirror_map_size(
				    mm->mm_children));
				zio->io_vsd = NULL;
				zio->io_error = ENXIO;
				return (NULL);
			}
		}
	} else {
		/*
		 * If we are resilvering, then we should handle scrub reads
		 * differently; we shouldn't issue them to the resilvering
		 * device because it might not have those blocks.
		 *
		 * We are resilvering iff:
		 * 1) We are a replacing vdev (i.e. our name is "replacing-1"
		 *    or "spare-1" or something like that), and
		 * 2) The pool is currently being resilvered.
		 *
		 * We cannot simply check vd->vdev_resilver_txg, because it's
		 * not set in this path.
		 *
		 * Nor can we just check our vdev_ops; there are cases (such as
		 * when a user types "zpool replace pool odev spare_dev" and
		 * spare_dev is in the spare list, or when a spare device is
		 * automatically used to replace a DEGRADED device) when
		 * resilvering is complete but both the original vdev and the
		 * spare vdev remain in the pool.  That behavior is intentional.
		 * It helps implement the policy that a spare should be
		 * automatically removed from the pool after the user replaces
		 * the device that originally failed.
		 *
		 * If a spa load is in progress, then spa_dsl_pool may be
		 * uninitialized.  But we shouldn't be resilvering during a spa
		 * load anyway.
		 */
		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) &&
		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
		    B_FALSE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			if (vdev_mirror_rebuilding(mc->mc_vd))
				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
		}
	}

	return (mm);
}

static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	int numerrors = 0;
	int lasterror = 0;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}
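
		/*
		 * A note on the MIN(x - 1, y - 1) + 1 idiom below
		 * (an observation, not new behavior): assuming the
		 * caller initializes *asize and *max_asize to 0, as
		 * vdev_open() appears to, the unsigned 0 - 1 wraps to
		 * UINT64_MAX, so the first healthy child seeds the
		 * minimum and each later child can only shrink it;
		 * the mirror thus reports the capacity of its
		 * smallest member.
		 */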
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	if (numerrors == vd->vdev_children) {
		if (vdev_children_are_offline(vd))
			vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
		else
			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_mirror_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

static void
vdev_mirror_scrub_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	if (zio->io_error == 0) {
		zio_t *pio;
		zio_link_t *zl = NULL;

		mutex_enter(&zio->io_lock);
		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
			mutex_enter(&pio->io_lock);
			ASSERT3U(zio->io_size, >=, pio->io_size);
			abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
			mutex_exit(&pio->io_lock);
		}
		mutex_exit(&zio->io_lock);
	}

	abd_free(zio->io_abd);

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

/*
 * Check the other, lower-index DVAs to see if they're on the same
 * vdev as the child we picked.  If they are, use them since they
 * are likely to have been allocated from the primary metaslab in
 * use at the time, and hence are more likely to have locality with
 * single-copy data.
 */
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
	dva_t *dva = zio->io_bp->blk_dva;
	mirror_map_t *mm = zio->io_vsd;
	int preferred;
	int c;

	preferred = mm->mm_preferred[p];
	for (p--; p >= 0; p--) {
		c = mm->mm_preferred[p];
		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
			preferred = c;
	}
	return (preferred);
}

static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	int p;

	if (mm->mm_root) {
		p = random_in_range(mm->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, p));
	}

	/*
	 * To ensure we don't always favour the first matching vdev,
	 * which could lead to wear leveling issues on SSD's, we
	 * use the I/O offset as a pseudo random seed into the vdevs
	 * which have the lowest load.
	 */
	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
	return (mm->mm_preferred[p]);
}
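
/*
 * A worked example of the offset hash above (descriptive only): with
 * vdev_mirror_shift = 21 the offset space is divided into 2 MiB
 * regions, so with two equally-loaded children an I/O at offset
 * 0x3ff00000 (just under 1 GiB) falls in region 511 and selects
 * mm_preferred[511 % 2], i.e. the second child, while an I/O in the
 * next 2 MiB region maps back to the first.
 */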

static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_readable(vd, mc->mc_offset));
	else
		return (vdev_readable(vd));
}

static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
	vdev_t *vd = mc->mc_vd;

	if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
		return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
	else
		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
}

/*
 * Try to find a vdev whose DTL doesn't contain the block we want to read,
 * preferring vdevs based on determined load.  If we can't, try the read on
 * any vdev we haven't already tried.
 *
 * Distributed spares are an exception to the above load rule.  They are
 * always preferred in order to detect gaps in the distributed spare which
 * are created when another disk in the dRAID fails.  In order to restore
 * redundancy those gaps must be read to trigger the required repair IO.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	uint64_t txg = zio->io_txg;
	int c, lowest_load;

	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

	lowest_load = INT_MAX;
	mm->mm_preferred_cnt = 0;
	for (c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc;

		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;

		if (mc->mc_vd == NULL ||
		    !vdev_mirror_child_readable(mc)) {
			mc->mc_error = SET_ERROR(ENXIO);
			mc->mc_tried = 1;	/* don't even try */
			mc->mc_skipped = 1;
			continue;
		}

		if (vdev_mirror_child_missing(mc, txg, 1)) {
			mc->mc_error = SET_ERROR(ESTALE);
			mc->mc_skipped = 1;
			mc->mc_speculative = 1;
			continue;
		}

		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
			mm->mm_preferred[0] = c;
			mm->mm_preferred_cnt = 1;
			break;
		}

		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
		if (mc->mc_load > lowest_load)
			continue;

		if (mc->mc_load < lowest_load) {
			lowest_load = mc->mc_load;
			mm->mm_preferred_cnt = 0;
		}
		mm->mm_preferred[mm->mm_preferred_cnt] = c;
		mm->mm_preferred_cnt++;
	}

	if (mm->mm_preferred_cnt == 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
		return (mm->mm_preferred[0]);
	}

	if (mm->mm_preferred_cnt > 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
		return (vdev_mirror_preferred_child_randomize(zio));
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++) {
		if (!mm->mm_child[c].mc_tried)
			return (c);
	}

	/*
	 * Every child failed.  There's no place left to look.
	 */
	return (-1);
}

static void
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_init(zio);
	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;

	if (mm == NULL) {
		ASSERT(!spa_trust_config(zio->io_spa));
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_bp != NULL &&
		    (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
			/*
			 * For scrubbing reads (if we can verify the
			 * checksum here, as indicated by io_bp being
			 * non-NULL) we need to allocate a read buffer for
			 * each child and issue reads to all children.  If
			 * any child succeeds, it will copy its data into
			 * zio->io_data in vdev_mirror_scrub_done.
			 */
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];

				/* Don't issue ZIOs to offline children */
				if (!vdev_mirror_child_readable(mc)) {
					mc->mc_error = SET_ERROR(ENXIO);
					mc->mc_tried = 1;
					mc->mc_skipped = 1;
					continue;
				}

				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset,
				    abd_alloc_sametype(zio->io_abd,
				    zio->io_size), zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_mirror_scrub_done, mc));
			}
			zio_execute(zio);
			return;
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		c++;

		/*
		 * When sequentially resilvering only issue write repair
		 * IOs to the vdev which is being rebuilt since performance
		 * is limited by the slowest child.  This is an issue for
		 * faster replacement devices such as distributed spares.
		 */
		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
		    mm->mm_rebuilding && !mc->mc_rebuilding) {
			continue;
		}

		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
	}

	zio_execute(zio);
}

static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int error[2] = { 0, 0 };

	for (int c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ? error[0] : error[1]);
}
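
/*
 * The two error buckets above, by example (descriptive only): a child
 * that failed outright (e.g. EIO, non-speculative, bucket 0) outranks
 * a child that was merely skipped because its DTL suggested the data
 * was missing (ESTALE, speculative, bucket 1), so a mirror with one
 * hard failure and one stale child reports EIO rather than ESTALE.
 */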

static void
vdev_mirror_io_done(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	int c;
	int good_copies = 0;
	int unexpected_errors = 0;

	if (mm == NULL)
		return;

	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];

		if (mc->mc_error) {
			if (!mc->mc_skipped)
				unexpected_errors++;
		} else if (mc->mc_tried) {
			good_copies++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (good_copies != mm->mm_children) {
			/*
			 * Always require at least one good copy.
			 *
			 * For ditto blocks (io_vd == NULL), require
			 * all copies to be good.
			 *
			 * XXX -- for replacing vdevs, there's no great answer.
			 * If the old device is really dead, we may not even
			 * be able to access it -- so we only want to
			 * require good writes to the new device.  But if
			 * the new device turns out to be flaky, we want
			 * to be able to detach it -- which requires all
			 * writes to the old device to have succeeded.
			 */
			if (good_copies == 0 || zio->io_vd == NULL)
				zio->io_error = vdev_mirror_worst_error(mm);
		}
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	/* XXPOLICY */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < mm->mm_children);
		mc = &mm->mm_child[c];
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		return;
	}

	/* XXPOLICY */
	if (good_copies == 0) {
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT(zio->io_error != 0);
	}

	if (good_copies && spa_writeable(zio->io_spa) &&
	    (unexpected_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			mc = &mm->mm_child[c];

			if (mc->mc_error == 0) {
				vdev_ops_t *ops = mc->mc_vd->vdev_ops;

				if (mc->mc_tried)
					continue;
				/*
				 * We didn't try this child.  We need to
				 * repair it if:
				 * 1. it's a scrub (in which case we have
				 *    tried everything that was healthy)
				 *  - or -
				 * 2. it's an indirect or distributed spare
				 *    vdev (in which case it could point to any
				 *    other vdev, which might have a bad DTL)
				 *  - or -
				 * 3. the DTL indicates that this data is
				 *    missing from this vdev
				 */
				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
				    ops != &vdev_indirect_ops &&
				    ops != &vdev_draid_spare_ops &&
				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
				    zio->io_txg, 1))
					continue;
				mc->mc_error = SET_ERROR(ESTALE);
			}

			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset,
			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children) {
		if (vdev_children_are_offline(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
			    VDEV_AUX_CHILDREN_OFFLINE);
		} else {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_NO_REPLICAS);
		}
	} else if (degraded + faulted != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	} else {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	}
}

/*
 * Return the maximum asize for a rebuild zio in the provided range.
 */
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
    uint64_t max_segment)
{
	(void) start;

	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
	    SPA_MAXBLOCKSIZE);

	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
}
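
/*
 * Worked example for the clamp above (numbers illustrative): with
 * vd->vdev_ashift = 12 (4 KiB sectors) and max_segment = 1000000,
 * P2ROUNDUP(1000000, 4096) = 1003520, which is then capped at
 * SPA_MAXBLOCKSIZE (16 MiB) and converted to an allocated size, so a
 * rebuild zio never exceeds one maximum-sized block per child.
 */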

vdev_ops_t vdev_mirror_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_replacing_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_spare_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_mirror_open,
	.vdev_op_close = vdev_mirror_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_mirror_io_start,
	.vdev_op_io_done = vdev_mirror_io_done,
	.vdev_op_state_change = vdev_mirror_state_change,
	.vdev_op_need_resilver = vdev_default_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
	"Rotating media load increment for non-seeking I/O's");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW,
	"Rotating media load increment for seeking I/O's");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW,
	"Offset in bytes from the last I/O which triggers "
	"a reduced rotating media seek increment");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW,
	"Non-rotating media load increment for non-seeking I/O's");

ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW,
	"Non-rotating media load increment for seeking I/O's");
/* END CSTYLED */
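
/*
 * The parameters above may be tuned at runtime; the knob paths are
 * platform-specific and shown here as an illustration only:
 *
 *	Linux:
 *	echo 0 > /sys/module/zfs/parameters/zfs_vdev_mirror_non_rotating_seek_inc
 *
 *	FreeBSD:
 *	sysctl vfs.zfs.vdev.mirror.non_rotating_seek_inc=0
 */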