/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system; it also
 * contains the list node linking the handler into the global zinject
 * handler list.
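 *
 * For ZINJECT_DELAY_IO handlers, zi_lanes points to an array of
 * zi_record.zi_nlanes entries recording when each "lane" next becomes
 * idle, and zi_next_lane is the index of the lane to use next; both are
 * unused (NULL/0) for every other handler type.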
 */
typedef struct inject_handler {
        int                     zi_id;
        spa_t                   *zi_spa;
        zinject_record_t        zi_record;
        uint64_t                *zi_lanes;
        int                     zi_next_lane;
        list_node_t             zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above; as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be a RW_WRITER of the
 * inject_lock, and reads of this count must be (at least) a RW_READER
 * of the lock.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
        /*
         * zero implies always (100%)
         */
        if (frequency == 0)
                return (B_TRUE);

        /*
         * Note: we still handle legacy (unscaled) frequency values
         */
        uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

        return (spa_get_random(maximum) < frequency);
}

/*
 * Returns true if the given record matches the I/O in progress.
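 *
 * There are two ways a record can match: a MOS match (both the bookmark
 * and the record refer to the meta-objset, optionally filtered by block
 * type), or an exact match on the record's objset/object/level, blkid
 * range, DVA mask, and error. In both cases the record's zi_freq
 * throttle is applied last.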
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
        /*
         * Check for a match against the MOS, which is based on type
         */
        if (zb->zb_objset == DMU_META_OBJSET &&
            record->zi_objset == DMU_META_OBJSET &&
            record->zi_object == DMU_META_DNODE_OBJECT) {
                if (record->zi_type == DMU_OT_NONE ||
                    type == record->zi_type)
                        return (freq_triggered(record->zi_freq));
                else
                        return (B_FALSE);
        }

        /*
         * Check for an exact match.
         */
        if (zb->zb_objset == record->zi_objset &&
            zb->zb_object == record->zi_object &&
            zb->zb_level == record->zi_level &&
            zb->zb_blkid >= record->zi_start &&
            zb->zb_blkid <= record->zi_end &&
            (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
            error == record->zi_error) {
                return (freq_triggered(record->zi_freq));
        }

        return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa)
                        continue;

                if (handler->zi_record.zi_type == type &&
                    strcmp(tag, handler->zi_record.zi_func) == 0)
                        panic("Panic requested in function %s\n", tag);
        }

        rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
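 *
 * The matcher is invoked with ZI_NO_DVA since a decryption failure is
 * not tied to any particular DVA.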
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
                        continue;

                if (zio_match_handler(zb, type, ZI_NO_DVA,
                    &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);
        return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
        int i = ZI_NO_DVA;

        if (zio->io_bp != NULL && zio->io_vd != NULL &&
            zio->io_child_type == ZIO_CHILD_VDEV) {
                for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
                        dva_t *dva = &zio->io_bp->blk_dva[i];
                        uint64_t off = DVA_GET_OFFSET(dva);
                        vdev_t *vd = vdev_lookup_top(zio->io_spa,
                            DVA_GET_VDEV(dva));

                        /* Compensate for vdev label added to leaves */
                        if (zio->io_vd->vdev_ops->vdev_op_leaf)
                                off += VDEV_LABEL_START_SIZE;

                        if (zio->io_vd == vd && zio->io_offset == off)
                                break;
                }
        }

        return (i);
}


/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        /*
         * Ignore I/O not associated with any logical data.
         */
        if (zio->io_logical == NULL)
                return (0);

        /*
         * Currently, we only support fault injection on reads.
         */
        if (zio->io_type != ZIO_TYPE_READ)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
                        continue;

                /* If this handler matches, return the specified error */
                if (zio_match_handler(&zio->io_logical->io_bookmark,
                    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
                    zio_match_dva(zio), &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region of
 * the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
        inject_handler_t *handler;
        vdev_t *vd = zio->io_vd;
        uint64_t offset = zio->io_offset;
        int label;
        int ret = 0;

        if (offset >= VDEV_LABEL_START_SIZE &&
            offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;

                if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
                        continue;

                /*
                 * The injection region is the relative offsets within a
                 * vdev label. We must determine the label which is being
                 * updated and adjust our region accordingly.
                 */
                label = vdev_label_number(vd->vdev_psize, offset);
                start = vdev_label_offset(vd->vdev_psize, label, start);
                end = vdev_label_offset(vd->vdev_psize, label, end);

                if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
                    (offset >= start && offset <= end)) {
                        ret = error;
                        break;
                }
        }
        rw_exit(&inject_lock);
        return (ret);
}

/*ARGSUSED*/
static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
        zio_t *zio __maybe_unused = private;
        uint8_t *buffer = data;
        uint_t byte = spa_get_random(len);

        ASSERT(zio->io_type == ZIO_TYPE_READ);

        /* flip a single random bit in an abd data buffer */
        buffer[byte] ^= 1 << spa_get_random(8);

        return (1);     /* stop after first flip */
}

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        inject_handler_t *handler;
        int ret = 0;

        /*
         * We skip over faults in the labels unless it's during
         * device open (i.e. zio == NULL).
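         * Label region I/O is instead handled by
         * zio_handle_label_injection() above.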
         */
        if (zio != NULL) {
                uint64_t offset = zio->io_offset;

                if (offset < VDEV_LABEL_START_SIZE ||
                    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
                        return (0);
        }

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
                        continue;

                if (vd->vdev_guid == handler->zi_record.zi_guid) {
                        if (handler->zi_record.zi_failfast &&
                            (zio == NULL || (zio->io_flags &
                            (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
                                continue;
                        }

                        /* Handle type specific I/O failures */
                        if (zio != NULL &&
                            handler->zi_record.zi_iotype != ZIO_TYPES &&
                            handler->zi_record.zi_iotype != zio->io_type)
                                continue;

                        if (handler->zi_record.zi_error == err1 ||
                            handler->zi_record.zi_error == err2) {
                                /*
                                 * limit error injection if requested
                                 */
                                if (!freq_triggered(handler->zi_record.zi_freq))
                                        continue;

                                /*
                                 * For a failed open, pretend like the device
                                 * has gone away.
                                 */
                                if (err1 == ENXIO)
                                        vd->vdev_stat.vs_aux =
                                            VDEV_AUX_OPEN_FAILED;

                                /*
                                 * Treat these errors as if they had been
                                 * retried so that all the appropriate stats
                                 * and FMA events are generated.
                                 */
                                if (!handler->zi_record.zi_failfast &&
                                    zio != NULL)
                                        zio->io_flags |= ZIO_FLAG_IO_RETRY;

                                /*
                                 * EILSEQ means flip a bit after a read
                                 */
                                if (handler->zi_record.zi_error == EILSEQ) {
                                        if (zio == NULL)
                                                break;

                                        /* locate buffer data and flip a bit */
                                        (void) abd_iterate_func(zio->io_abd, 0,
                                            zio->io_size, zio_inject_bitflip_cb,
                                            zio);
                                        break;
                                }

                                ret = handler->zi_record.zi_error;
                                break;
                        }
                        if (handler->zi_record.zi_error == ENXIO) {
                                ret = SET_ERROR(EIO);
                                break;
                        }
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
        return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                /* Ignore errors not destined for this pool */
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                /*
                 * Positive duration implies # of seconds, negative
                 * a number of txgs
                 */
                if (handler->zi_record.zi_timer == 0) {
                        if (handler->zi_record.zi_duration > 0)
                                handler->zi_record.zi_timer = ddi_get_lbolt64();
                        else
                                handler->zi_record.zi_timer = zio->io_txg;
                }

                /* Have a "problem" writing 60% of the time */
                if (spa_get_random(100) < 60)
                        zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
                break;
        }

        rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
        inject_handler_t *handler;

        if (zio_injection_enabled == 0)
                return;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                if (handler->zi_record.zi_duration > 0) {
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            ddi_time_after64(
                            (int64_t)handler->zi_record.zi_timer +
                            handler->zi_record.zi_duration * hz,
                            ddi_get_lbolt64()));
                } else {
                        /* duration is negative so the subtraction here adds */
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            handler->zi_record.zi_timer -
                            handler->zi_record.zi_duration >=
                            spa_syncing_txg(spa));
                }
        }

        rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        inject_handler_t *min_handler = NULL;
        hrtime_t min_target = 0;

        rw_enter(&inject_lock, RW_READER);

        /*
         * inject_delay_count is a subset of zio_injection_enabled that
         * is only incremented for delay handlers. These checks are
         * mainly added to remind the reader why we're not explicitly
         * checking zio_injection_enabled like the other functions.
         */
        IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
        IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

        /*
         * If there aren't any inject delay handlers registered, then we
         * can short circuit and simply return 0 here. A value of zero
         * informs zio_delay_interrupt() that this request should not be
         * delayed. This short circuit keeps us from acquiring the
         * inject_delay_mtx unnecessarily.
         */
        if (inject_delay_count == 0) {
                rw_exit(&inject_lock);
                return (0);
        }

        /*
         * Each inject handler has a number of "lanes" associated with
         * it. Each lane is able to handle requests independently of one
         * another, and at a latency defined by the inject handler
         * record's zi_timer field. Thus if a handler is configured with
         * a single lane with a 10ms latency, it will delay requests
         * such that only a single request is completed every 10ms. So,
         * if more than one request is attempted per each 10ms interval,
         * the average latency of the requests will be greater than
         * 10ms; but if only a single request is submitted each 10ms
         * interval the average latency will be 10ms.
         *
         * We need to acquire this mutex to prevent multiple concurrent
         * threads from being assigned to the same lane of a given inject
         * handler. The mutex allows us to perform the following two
         * operations atomically:
         *
         *      1. determine the minimum handler and minimum target
         *         value of all the possible handlers
         *      2. update that minimum handler's lane array
         *
         * Without atomicity, two (or more) threads could pick the same
         * lane in step (1), and then conflict with each other in step
         * (2). This could allow a single lane handler to process
         * multiple requests simultaneously, which shouldn't be possible.
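         *
         * To make the lane model above concrete: a handler created with
         * two lanes and a 10ms zi_timer behaves roughly like a device
         * with two independent 10ms-latency queues, so at most two
         * requests complete per 10ms interval before further requests
         * start queuing behind one another.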
         */
        mutex_enter(&inject_delay_mtx);

        for (inject_handler_t *handler = list_head(&inject_handlers);
            handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
                        continue;

                if (!freq_triggered(handler->zi_record.zi_freq))
                        continue;

                if (vd->vdev_guid != handler->zi_record.zi_guid)
                        continue;

                /*
                 * Defensive; should never happen as the array allocation
                 * occurs prior to inserting this handler on the list.
                 */
                ASSERT3P(handler->zi_lanes, !=, NULL);

                /*
                 * This should never happen, the zinject command should
                 * prevent a user from setting an IO delay with zero lanes.
                 */
                ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

                ASSERT3U(handler->zi_record.zi_nlanes, >,
                    handler->zi_next_lane);

                /*
                 * We want to issue this IO to the lane that will become
                 * idle the soonest, so we compare the soonest this
                 * specific handler can complete the IO with all other
                 * handlers, to find the lowest value of all possible
                 * lanes. We then use this lane to submit the request.
                 *
                 * Since each handler has a constant value for its
                 * delay, we can just use the "next" lane for that
                 * handler; as it will always be the lane with the
                 * lowest value for that particular handler (i.e. the
                 * lane that will become idle the soonest). This saves a
                 * scan of each handler's lanes array.
                 *
                 * There are two cases to consider when determining when
                 * this specific IO request should complete. If this
                 * lane is idle, we want to "submit" the request now so
                 * it will complete after zi_timer milliseconds. Thus,
                 * we set the target to now + zi_timer.
                 *
                 * If the lane is busy, we want this request to complete
                 * zi_timer milliseconds after the lane becomes idle.
                 * Since the 'zi_lanes' array holds the time at which
                 * each lane will become idle, we use that value to
                 * determine when this request should complete.
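                 *
                 * For example (illustrative numbers, using the 10ms delay
                 * from above): if the lane is already idle, the request is
                 * targeted to complete at now + 10ms; if the lane only
                 * becomes idle 5ms from now, the target is
                 * (now + 5ms) + 10ms, i.e. now + 15ms. The MAX() below
                 * picks whichever of the two is later.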
                 */
                hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
                hrtime_t busy = handler->zi_record.zi_timer +
                    handler->zi_lanes[handler->zi_next_lane];
                hrtime_t target = MAX(idle, busy);

                if (min_handler == NULL) {
                        min_handler = handler;
                        min_target = target;
                        continue;
                }

                ASSERT3P(min_handler, !=, NULL);
                ASSERT3U(min_target, !=, 0);

                /*
                 * We don't yet increment the "next lane" variable since
                 * we still might find a lower value lane in another
                 * handler during any remaining iterations. Once we're
                 * sure we've selected the absolute minimum, we'll claim
                 * the lane and increment the handler's "next lane"
                 * field below.
                 */

                if (target < min_target) {
                        min_handler = handler;
                        min_target = target;
                }
        }

        /*
         * 'min_handler' will be NULL if no IO delays are registered for
         * this vdev, otherwise it will point to the handler containing
         * the lane that will become idle the soonest.
         */
        if (min_handler != NULL) {
                ASSERT3U(min_target, !=, 0);
                min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

                /*
                 * If we've used all possible lanes for this handler,
                 * loop back and start using the first lane again;
                 * otherwise, just increment the lane index.
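                 *
                 * Both cases are covered by the modulo arithmetic below:
                 * when zi_next_lane + 1 reaches zi_nlanes the index wraps
                 * back around to lane 0.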
                 */
                min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
                    min_handler->zi_record.zi_nlanes;
        }

        mutex_exit(&inject_delay_mtx);
        rw_exit(&inject_lock);

        return (min_target);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        objset_t *os = NULL;
        dnode_t *dn = NULL;
        int error;

        /*
         * Obtain the dnode for the object using the pool, objset, and
         * object IDs.
         */
        error = dsl_pool_hold(pool, FTAG, &dp);
        if (error)
                return (error);

        error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
        dsl_pool_rele(dp, FTAG);
        if (error)
                return (error);

        error = dmu_objset_from_ds(ds, &os);
        dsl_dataset_rele(ds, FTAG);
        if (error)
                return (error);

        error = dnode_hold(os, record->zi_object, FTAG, &dn);
        if (error)
                return (error);

        /*
         * Translate the range into block IDs
         */
        if (record->zi_start != 0 || record->zi_end != -1ULL) {
                record->zi_start >>= dn->dn_datablkshift;
                record->zi_end >>= dn->dn_datablkshift;
        }
        if (record->zi_level > 0) {
                if (record->zi_level >= dn->dn_nlevels) {
                        dnode_rele(dn, FTAG);
                        return (SET_ERROR(EDOM));
                }

                if (record->zi_start != 0 || record->zi_end != 0) {
                        int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

                        for (int level = record->zi_level; level > 0; level--) {
                                record->zi_start >>= shift;
                                record->zi_end >>= shift;
                        }
                }
        }

        dnode_rele(dn, FTAG);
        return (0);
}

/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
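 *
 * When ZINJECT_NULL is set in 'flags' no handler is registered; the call
 * is made only for its side effects (an optional spa_reset() and/or ARC
 * flush, depending on the other flags).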
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
        inject_handler_t *handler;
        int error;
        spa_t *spa;

        /*
         * If this is pool-wide metadata, make sure we unload the corresponding
         * spa_t, so that the next attempt to load it will trigger the fault.
         * We call spa_reset() to unload the pool appropriately.
         */
        if (flags & ZINJECT_UNLOAD_SPA)
                if ((error = spa_reset(name)) != 0)
                        return (error);

        if (record->zi_cmd == ZINJECT_DELAY_IO) {
                /*
                 * A value of zero for the number of lanes or for the
                 * delay time doesn't make sense.
                 */
                if (record->zi_timer == 0 || record->zi_nlanes == 0)
                        return (SET_ERROR(EINVAL));

                /*
                 * The number of lanes is directly mapped to the size of
                 * an array used by the handler. Thus, to ensure the
                 * user doesn't trigger an allocation that's "too large"
                 * we cap the number of lanes here.
                 */
                if (record->zi_nlanes >= UINT16_MAX)
                        return (SET_ERROR(EINVAL));
        }

        /*
         * If the supplied range was in bytes -- calculate the actual blkid
         */
        if (flags & ZINJECT_CALC_RANGE) {
                error = zio_calculate_range(name, record);
                if (error != 0)
                        return (error);
        }

        if (!(flags & ZINJECT_NULL)) {
                /*
                 * spa_inject_addref() will add an injection reference, which
                 * will prevent the pool from being removed from the namespace
                 * while still allowing it to be unloaded.
                 */
                if ((spa = spa_inject_addref(name)) == NULL)
                        return (SET_ERROR(ENOENT));

                handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

                handler->zi_spa = spa;
                handler->zi_record = *record;

                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        handler->zi_lanes = kmem_zalloc(
                            sizeof (*handler->zi_lanes) *
                            handler->zi_record.zi_nlanes, KM_SLEEP);
                        handler->zi_next_lane = 0;
                } else {
                        handler->zi_lanes = NULL;
                        handler->zi_next_lane = 0;
                }

                rw_enter(&inject_lock, RW_WRITER);

                /*
                 * We can't move this increment into the conditional
                 * above because we need to hold the RW_WRITER lock of
                 * inject_lock, and we don't want to hold that while
                 * allocating the handler's zi_lanes array.
                 */
                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        ASSERT3S(inject_delay_count, >=, 0);
                        inject_delay_count++;
                        ASSERT3S(inject_delay_count, >, 0);
                }

                *id = handler->zi_id = inject_next_id++;
                list_insert_tail(&inject_handlers, handler);
                atomic_inc_32(&zio_injection_enabled);

                rw_exit(&inject_lock);
        }

        /*
         * Flush the ARC, so that any attempts to read this data will end up
         * going to the ZIO layer. Note that this is a little overkill, but
         * we don't have the necessary ARC interfaces to do anything else, and
         * fault injection isn't a performance critical path.
         */
        if (flags & ZINJECT_FLUSH_ARC)
                /*
                 * We must use FALSE to ensure arc_flush returns, since
                 * we're not preventing concurrent ARC insertions.
                 */
                arc_flush(NULL, FALSE);

        return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
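 *
 * A caller would typically start with *id = 0 and keep passing the
 * updated *id back in until ENOENT is returned.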
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
        inject_handler_t *handler;
        int ret;

        mutex_enter(&spa_namespace_lock);
        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id > *id)
                        break;

        if (handler) {
                *record = handler->zi_record;
                *id = handler->zi_id;
                (void) strncpy(name, spa_name(handler->zi_spa), buflen);
                ret = 0;
        } else {
                ret = SET_ERROR(ENOENT);
        }

        rw_exit(&inject_lock);
        mutex_exit(&spa_namespace_lock);

        return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_WRITER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id == id)
                        break;

        if (handler == NULL) {
                rw_exit(&inject_lock);
                return (SET_ERROR(ENOENT));
        }

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3S(inject_delay_count, >, 0);
                inject_delay_count--;
                ASSERT3S(inject_delay_count, >=, 0);
        }

        list_remove(&inject_handlers, handler);
        rw_exit(&inject_lock);

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3P(handler->zi_lanes, !=, NULL);
                kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
                    handler->zi_record.zi_nlanes);
        } else {
                ASSERT3P(handler->zi_lanes, ==, NULL);
        }

        spa_inject_delref(handler->zi_spa);
        kmem_free(handler, sizeof (inject_handler_t));
        atomic_dec_32(&zio_injection_enabled);

        return (0);
}

void
zio_inject_init(void)
{
        rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
        list_create(&inject_handlers, sizeof (inject_handler_t),
            offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
        list_destroy(&inject_handlers);
        mutex_destroy(&inject_delay_mtx);
        rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif