1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 221544Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens #include <sys/zfs_context.h> 29789Sahrens #include <sys/spa.h> 30789Sahrens #include <sys/vdev_impl.h> 31789Sahrens #include <sys/zio.h> 32789Sahrens #include <sys/zio_checksum.h> 33789Sahrens #include <sys/fs/zfs.h> 341544Seschrock #include <sys/fm/fs/zfs.h> 35789Sahrens 36789Sahrens /* 37789Sahrens * Virtual device vector for RAID-Z. 38789Sahrens */ 39789Sahrens 40789Sahrens /* 41789Sahrens * We currently allow up to two-way replication (i.e. single-fault 42789Sahrens * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs 43789Sahrens * must all be multiples of two times the leaf vdev blocksize. 44789Sahrens */ 45789Sahrens #define VDEV_RAIDZ_ALIGN 2ULL 46789Sahrens 47789Sahrens typedef struct raidz_col { 48789Sahrens uint64_t rc_col; 49789Sahrens uint64_t rc_offset; 50789Sahrens uint64_t rc_size; 51789Sahrens void *rc_data; 52789Sahrens int rc_error; 53789Sahrens short rc_tried; 54789Sahrens short rc_skipped; 55789Sahrens } raidz_col_t; 56789Sahrens 57789Sahrens typedef struct raidz_map { 58789Sahrens uint64_t rm_cols; 59789Sahrens uint64_t rm_bigcols; 60789Sahrens uint64_t rm_asize; 61789Sahrens int rm_missing_child; 62789Sahrens int rm_firstdatacol; 63789Sahrens raidz_col_t rm_col[1]; 64789Sahrens } raidz_map_t; 65789Sahrens 66789Sahrens static raidz_map_t * 671133Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols) 68789Sahrens { 69789Sahrens raidz_map_t *rm; 70789Sahrens uint64_t b = zio->io_offset >> unit_shift; 71789Sahrens uint64_t s = zio->io_size >> unit_shift; 72789Sahrens uint64_t f = b % dcols; 73789Sahrens uint64_t o = (b / dcols) << unit_shift; 74789Sahrens uint64_t q, r, c, bc, col, acols, coff; 75789Sahrens int firstdatacol; 76789Sahrens 771133Seschrock q = s / (dcols - 1); 781133Seschrock r = s - q * (dcols - 1); 791133Seschrock bc = r + !!r; 801133Seschrock firstdatacol = 1; 81789Sahrens 82789Sahrens acols = (q == 0 ? bc : dcols); 83789Sahrens 84789Sahrens rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 85789Sahrens 86789Sahrens rm->rm_cols = acols; 87789Sahrens rm->rm_bigcols = bc; 88789Sahrens rm->rm_asize = 0; 89789Sahrens rm->rm_missing_child = -1; 90789Sahrens rm->rm_firstdatacol = firstdatacol; 91789Sahrens 92789Sahrens for (c = 0; c < acols; c++) { 93789Sahrens col = f + c; 94789Sahrens coff = o; 95789Sahrens if (col >= dcols) { 96789Sahrens col -= dcols; 97789Sahrens coff += 1ULL << unit_shift; 98789Sahrens } 99789Sahrens rm->rm_col[c].rc_col = col; 100789Sahrens rm->rm_col[c].rc_offset = coff; 101789Sahrens rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 102789Sahrens rm->rm_col[c].rc_data = NULL; 103789Sahrens rm->rm_col[c].rc_error = 0; 104789Sahrens rm->rm_col[c].rc_tried = 0; 105789Sahrens rm->rm_col[c].rc_skipped = 0; 106789Sahrens rm->rm_asize += rm->rm_col[c].rc_size; 107789Sahrens } 108789Sahrens 109789Sahrens rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift); 110789Sahrens 111789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 112789Sahrens rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 113789Sahrens 114789Sahrens rm->rm_col[c].rc_data = zio->io_data; 115789Sahrens 116789Sahrens for (c = c + 1; c < acols; c++) 117789Sahrens rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 118789Sahrens rm->rm_col[c - 1].rc_size; 119789Sahrens 1201133Seschrock /* 1211133Seschrock * To prevent hot parity disks, switch the parity and data 1221133Seschrock * columns every 1MB. 1231133Seschrock */ 1241133Seschrock ASSERT(rm->rm_cols >= 2); 1251133Seschrock ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 126789Sahrens 1271133Seschrock if (zio->io_offset & (1ULL << 20)) { 1281133Seschrock col = rm->rm_col[0].rc_col; 1291133Seschrock o = rm->rm_col[0].rc_offset; 1301133Seschrock rm->rm_col[0].rc_col = rm->rm_col[1].rc_col; 1311133Seschrock rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1321133Seschrock rm->rm_col[1].rc_col = col; 1331133Seschrock rm->rm_col[1].rc_offset = o; 134789Sahrens } 135789Sahrens 136789Sahrens zio->io_vsd = rm; 137789Sahrens return (rm); 138789Sahrens } 139789Sahrens 140789Sahrens static void 141789Sahrens vdev_raidz_map_free(zio_t *zio) 142789Sahrens { 143789Sahrens raidz_map_t *rm = zio->io_vsd; 144789Sahrens int c; 145789Sahrens 146789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) 147789Sahrens zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 148789Sahrens 149789Sahrens kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 150789Sahrens zio->io_vsd = NULL; 151789Sahrens } 152789Sahrens 153789Sahrens static void 154789Sahrens vdev_raidz_reconstruct(raidz_map_t *rm, int x) 155789Sahrens { 156789Sahrens uint64_t *dst, *src, count, xsize, csize; 157789Sahrens int i, c; 158789Sahrens 159789Sahrens for (c = 0; c < rm->rm_cols; c++) { 160789Sahrens if (c == x) 161789Sahrens continue; 162789Sahrens src = rm->rm_col[c].rc_data; 163789Sahrens dst = rm->rm_col[x].rc_data; 164789Sahrens csize = rm->rm_col[c].rc_size; 165789Sahrens xsize = rm->rm_col[x].rc_size; 166789Sahrens count = MIN(csize, xsize) / sizeof (uint64_t); 167789Sahrens if (c == !x) { 168789Sahrens /* 169789Sahrens * The initial copy happens at either c == 0 or c == 1. 170789Sahrens * Both of these columns are 'big' columns, so we'll 171789Sahrens * definitely initialize all of column x. 172789Sahrens */ 173789Sahrens ASSERT3U(xsize, <=, csize); 174789Sahrens for (i = 0; i < count; i++) 175789Sahrens *dst++ = *src++; 176789Sahrens } else { 177789Sahrens for (i = 0; i < count; i++) 178789Sahrens *dst++ ^= *src++; 179789Sahrens } 180789Sahrens } 181789Sahrens } 182789Sahrens 183789Sahrens static int 184789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 185789Sahrens { 186789Sahrens vdev_t *cvd; 187789Sahrens int c, error; 188789Sahrens int lasterror = 0; 189789Sahrens int numerrors = 0; 190789Sahrens 191789Sahrens /* 192789Sahrens * XXX -- minimum children should be raid-type-specific 193789Sahrens */ 194789Sahrens if (vd->vdev_children < 2) { 195789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 196789Sahrens return (EINVAL); 197789Sahrens } 198789Sahrens 199789Sahrens for (c = 0; c < vd->vdev_children; c++) { 200789Sahrens cvd = vd->vdev_child[c]; 201789Sahrens 202789Sahrens if ((error = vdev_open(cvd)) != 0) { 203789Sahrens lasterror = error; 204789Sahrens numerrors++; 205789Sahrens continue; 206789Sahrens } 207789Sahrens 208789Sahrens *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 209*1732Sbonwick *ashift = MAX(*ashift, cvd->vdev_ashift); 210789Sahrens } 211789Sahrens 212789Sahrens *asize *= vd->vdev_children; 213789Sahrens 214789Sahrens if (numerrors > 1) { 215789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 216789Sahrens return (lasterror); 217789Sahrens } 218789Sahrens 219789Sahrens return (0); 220789Sahrens } 221789Sahrens 222789Sahrens static void 223789Sahrens vdev_raidz_close(vdev_t *vd) 224789Sahrens { 225789Sahrens int c; 226789Sahrens 227789Sahrens for (c = 0; c < vd->vdev_children; c++) 228789Sahrens vdev_close(vd->vdev_child[c]); 229789Sahrens } 230789Sahrens 231789Sahrens static uint64_t 232789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize) 233789Sahrens { 234789Sahrens uint64_t asize; 235*1732Sbonwick uint64_t ashift = vd->vdev_top->vdev_ashift; 236789Sahrens uint64_t cols = vd->vdev_children; 237789Sahrens 238*1732Sbonwick asize = ((psize - 1) >> ashift) + 1; 239789Sahrens asize += (asize + cols - 2) / (cols - 1); 240*1732Sbonwick asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift; 241789Sahrens 242789Sahrens return (asize); 243789Sahrens } 244789Sahrens 245789Sahrens static void 246789Sahrens vdev_raidz_child_done(zio_t *zio) 247789Sahrens { 248789Sahrens raidz_col_t *rc = zio->io_private; 249789Sahrens 250789Sahrens rc->rc_error = zio->io_error; 251789Sahrens rc->rc_tried = 1; 252789Sahrens rc->rc_skipped = 0; 253789Sahrens } 254789Sahrens 255789Sahrens static void 256789Sahrens vdev_raidz_repair_done(zio_t *zio) 257789Sahrens { 258*1732Sbonwick ASSERT(zio->io_private == zio->io_parent); 259*1732Sbonwick vdev_raidz_map_free(zio->io_private); 260789Sahrens } 261789Sahrens 262789Sahrens static void 263789Sahrens vdev_raidz_io_start(zio_t *zio) 264789Sahrens { 265789Sahrens vdev_t *vd = zio->io_vd; 266*1732Sbonwick vdev_t *tvd = vd->vdev_top; 267789Sahrens vdev_t *cvd; 268789Sahrens blkptr_t *bp = zio->io_bp; 269789Sahrens raidz_map_t *rm; 270789Sahrens raidz_col_t *rc; 271789Sahrens int c; 272789Sahrens 273*1732Sbonwick rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children); 274789Sahrens 275789Sahrens if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { 276789Sahrens ASSERT3U(rm->rm_asize, ==, 277789Sahrens vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); 278789Sahrens } else { 279789Sahrens ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); 280789Sahrens } 281789Sahrens 282789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 283789Sahrens 284789Sahrens /* 285789Sahrens * Generate RAID parity in virtual column 0. 286789Sahrens */ 287789Sahrens vdev_raidz_reconstruct(rm, 0); 288789Sahrens 289789Sahrens for (c = 0; c < rm->rm_cols; c++) { 290789Sahrens rc = &rm->rm_col[c]; 291789Sahrens cvd = vd->vdev_child[rc->rc_col]; 292789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 293789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 294789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 295789Sahrens vdev_raidz_child_done, rc)); 296789Sahrens } 297789Sahrens zio_wait_children_done(zio); 298789Sahrens return; 299789Sahrens } 300789Sahrens 301789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 302789Sahrens 303789Sahrens for (c = rm->rm_cols - 1; c >= 0; c--) { 304789Sahrens rc = &rm->rm_col[c]; 305789Sahrens cvd = vd->vdev_child[rc->rc_col]; 306789Sahrens if (vdev_is_dead(cvd)) { 307789Sahrens rm->rm_missing_child = c; 308789Sahrens rc->rc_error = ENXIO; 309789Sahrens rc->rc_tried = 1; /* don't even try */ 310789Sahrens rc->rc_skipped = 1; 311789Sahrens continue; 312789Sahrens } 313789Sahrens if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 314789Sahrens rm->rm_missing_child = c; 315789Sahrens rc->rc_error = ESTALE; 316789Sahrens rc->rc_skipped = 1; 317789Sahrens continue; 318789Sahrens } 319789Sahrens if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 || 320789Sahrens (zio->io_flags & ZIO_FLAG_SCRUB)) { 321789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 322789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 323789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 324789Sahrens vdev_raidz_child_done, rc)); 325789Sahrens } 326789Sahrens } 327789Sahrens 328789Sahrens zio_wait_children_done(zio); 329789Sahrens } 330789Sahrens 3311544Seschrock /* 3321544Seschrock * Report a checksum error for a child of a RAID-Z device. 3331544Seschrock */ 3341544Seschrock static void 3351544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 3361544Seschrock { 3371544Seschrock vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col]; 3381544Seschrock dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", 3391544Seschrock vdev_description(vd)); 3401544Seschrock 3411544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3421544Seschrock mutex_enter(&vd->vdev_stat_lock); 3431544Seschrock vd->vdev_stat.vs_checksum_errors++; 3441544Seschrock mutex_exit(&vd->vdev_stat_lock); 3451544Seschrock } 3461544Seschrock 3471544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 3481544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 3491544Seschrock zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 3501544Seschrock } 3511544Seschrock 3521544Seschrock 353789Sahrens static void 354789Sahrens vdev_raidz_io_done(zio_t *zio) 355789Sahrens { 356789Sahrens vdev_t *vd = zio->io_vd; 357789Sahrens vdev_t *cvd; 358789Sahrens raidz_map_t *rm = zio->io_vsd; 359789Sahrens raidz_col_t *rc; 360789Sahrens blkptr_t *bp = zio->io_bp; 361789Sahrens int unexpected_errors = 0; 362789Sahrens int c; 363789Sahrens 364789Sahrens ASSERT(bp != NULL); /* XXX need to add code to enforce this */ 365789Sahrens 366789Sahrens zio->io_error = 0; 367789Sahrens zio->io_numerrors = 0; 368789Sahrens 369789Sahrens for (c = 0; c < rm->rm_cols; c++) { 370789Sahrens rc = &rm->rm_col[c]; 371789Sahrens 372789Sahrens /* 373789Sahrens * We preserve any EIOs because those may be worth retrying; 374789Sahrens * whereas ECKSUM and ENXIO are more likely to be persistent. 375789Sahrens */ 376789Sahrens if (rc->rc_error) { 377789Sahrens if (zio->io_error != EIO) 378789Sahrens zio->io_error = rc->rc_error; 379789Sahrens if (!rc->rc_skipped) 380789Sahrens unexpected_errors++; 381789Sahrens zio->io_numerrors++; 382789Sahrens } 383789Sahrens } 384789Sahrens 385789Sahrens if (zio->io_type == ZIO_TYPE_WRITE) { 386789Sahrens /* 387789Sahrens * If this is not a failfast write, and we were able to 388789Sahrens * write enough columns to reconstruct the data, good enough. 389789Sahrens */ 390789Sahrens /* XXPOLICY */ 391789Sahrens if (zio->io_numerrors <= rm->rm_firstdatacol && 392789Sahrens !(zio->io_flags & ZIO_FLAG_FAILFAST)) 393789Sahrens zio->io_error = 0; 394789Sahrens 395789Sahrens vdev_raidz_map_free(zio); 396789Sahrens zio_next_stage(zio); 397789Sahrens return; 398789Sahrens } 399789Sahrens 400789Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 401789Sahrens 402789Sahrens /* 403789Sahrens * If there were no I/O errors, and the data checksums correctly, 404789Sahrens * the read is complete. 405789Sahrens */ 406789Sahrens /* XXPOLICY */ 407789Sahrens if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) { 408789Sahrens ASSERT(unexpected_errors == 0); 409789Sahrens ASSERT(zio->io_error == 0); 410789Sahrens 411789Sahrens /* 412789Sahrens * We know the data's good. If we read the parity, 413789Sahrens * verify that it's good as well. If not, fix it. 414789Sahrens */ 415789Sahrens for (c = 0; c < rm->rm_firstdatacol; c++) { 416789Sahrens void *orig; 417789Sahrens rc = &rm->rm_col[c]; 418789Sahrens if (!rc->rc_tried) 419789Sahrens continue; 420789Sahrens orig = zio_buf_alloc(rc->rc_size); 421789Sahrens bcopy(rc->rc_data, orig, rc->rc_size); 422789Sahrens vdev_raidz_reconstruct(rm, c); 423789Sahrens if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { 4241544Seschrock raidz_checksum_error(zio, rc); 425789Sahrens rc->rc_error = ECKSUM; 426789Sahrens unexpected_errors++; 427789Sahrens } 428789Sahrens zio_buf_free(orig, rc->rc_size); 429789Sahrens } 430789Sahrens goto done; 431789Sahrens } 432789Sahrens 433789Sahrens /* 434789Sahrens * If there was exactly one I/O error, it's the one we expected, 435789Sahrens * and the reconstructed data checksums, the read is complete. 436789Sahrens * This happens when one child is offline and vdev_fault_assess() 437789Sahrens * knows it, or when one child has stale data and the DTL knows it. 438789Sahrens */ 439789Sahrens if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) { 440789Sahrens rc = &rm->rm_col[c]; 441789Sahrens ASSERT(unexpected_errors == 0); 442789Sahrens ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE); 443789Sahrens vdev_raidz_reconstruct(rm, c); 444789Sahrens if (zio_checksum_error(zio) == 0) { 445789Sahrens zio->io_error = 0; 446789Sahrens goto done; 447789Sahrens } 448789Sahrens } 449789Sahrens 450789Sahrens /* 451789Sahrens * This isn't a typical error -- either we got a read error or 452789Sahrens * more than one child claimed a problem. Read every block we 453789Sahrens * haven't already so we can try combinatorial reconstruction. 454789Sahrens */ 455789Sahrens unexpected_errors = 1; 456789Sahrens rm->rm_missing_child = -1; 457789Sahrens 458789Sahrens for (c = 0; c < rm->rm_cols; c++) 459789Sahrens if (!rm->rm_col[c].rc_tried) 460789Sahrens break; 461789Sahrens 462789Sahrens if (c != rm->rm_cols) { 463789Sahrens zio->io_error = 0; 464789Sahrens zio_vdev_io_redone(zio); 465789Sahrens for (c = 0; c < rm->rm_cols; c++) { 466789Sahrens rc = &rm->rm_col[c]; 467789Sahrens if (rc->rc_tried) 468789Sahrens continue; 469789Sahrens zio_nowait(zio_vdev_child_io(zio, NULL, 470789Sahrens vd->vdev_child[rc->rc_col], 471789Sahrens rc->rc_offset, rc->rc_data, rc->rc_size, 472789Sahrens zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 473789Sahrens vdev_raidz_child_done, rc)); 474789Sahrens } 475789Sahrens zio_wait_children_done(zio); 476789Sahrens return; 477789Sahrens } 478789Sahrens 479789Sahrens /* 480789Sahrens * If there were more errors than parity disks, give up. 481789Sahrens */ 482789Sahrens if (zio->io_numerrors > rm->rm_firstdatacol) { 483789Sahrens ASSERT(zio->io_error != 0); 484789Sahrens goto done; 485789Sahrens } 486789Sahrens 487789Sahrens /* 488789Sahrens * The number of I/O errors is correctable. Correct them here. 489789Sahrens */ 490789Sahrens ASSERT(zio->io_numerrors <= rm->rm_firstdatacol); 491789Sahrens for (c = 0; c < rm->rm_cols; c++) { 492789Sahrens rc = &rm->rm_col[c]; 493789Sahrens ASSERT(rc->rc_tried); 494789Sahrens if (rc->rc_error) { 495789Sahrens vdev_raidz_reconstruct(rm, c); 496789Sahrens if (zio_checksum_error(zio) == 0) 497789Sahrens zio->io_error = 0; 498789Sahrens else 499789Sahrens zio->io_error = rc->rc_error; 500789Sahrens goto done; 501789Sahrens } 502789Sahrens } 503789Sahrens 504789Sahrens /* 505789Sahrens * There were no I/O errors, but the data doesn't checksum. 506789Sahrens * Try all permutations to see if we can find one that does. 507789Sahrens */ 508789Sahrens ASSERT(zio->io_numerrors == 0); 509789Sahrens for (c = 0; c < rm->rm_cols; c++) { 510789Sahrens void *orig; 511789Sahrens rc = &rm->rm_col[c]; 512789Sahrens 513789Sahrens orig = zio_buf_alloc(rc->rc_size); 514789Sahrens bcopy(rc->rc_data, orig, rc->rc_size); 515789Sahrens vdev_raidz_reconstruct(rm, c); 516789Sahrens 517789Sahrens if (zio_checksum_error(zio) == 0) { 518789Sahrens zio_buf_free(orig, rc->rc_size); 519789Sahrens zio->io_error = 0; 520789Sahrens /* 521789Sahrens * If this child didn't know that it returned bad data, 522789Sahrens * inform it. 523789Sahrens */ 524789Sahrens if (rc->rc_tried && rc->rc_error == 0) 5251544Seschrock raidz_checksum_error(zio, rc); 526789Sahrens rc->rc_error = ECKSUM; 527789Sahrens goto done; 528789Sahrens } 529789Sahrens 530789Sahrens bcopy(orig, rc->rc_data, rc->rc_size); 531789Sahrens zio_buf_free(orig, rc->rc_size); 532789Sahrens } 533789Sahrens 534789Sahrens /* 5351544Seschrock * All combinations failed to checksum. Generate checksum ereports for 5361544Seschrock * every one. 537789Sahrens */ 538789Sahrens zio->io_error = ECKSUM; 5391544Seschrock if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 5401544Seschrock for (c = 0; c < rm->rm_cols; c++) { 5411544Seschrock rc = &rm->rm_col[c]; 5421544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 5431544Seschrock zio->io_spa, vd->vdev_child[rc->rc_col], zio, 5441544Seschrock rc->rc_offset, rc->rc_size); 5451544Seschrock } 5461544Seschrock } 547789Sahrens 548789Sahrens done: 549789Sahrens zio_checksum_verified(zio); 550789Sahrens 551789Sahrens if (zio->io_error == 0 && (spa_mode & FWRITE) && 552789Sahrens (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 553*1732Sbonwick zio_t *rio; 554*1732Sbonwick 555789Sahrens /* 556789Sahrens * Use the good data we have in hand to repair damaged children. 557*1732Sbonwick * 558*1732Sbonwick * We issue all repair I/Os as children of 'rio' to arrange 559*1732Sbonwick * that vdev_raidz_map_free(zio) will be invoked after all 560*1732Sbonwick * repairs complete, but before we advance to the next stage. 561789Sahrens */ 562*1732Sbonwick rio = zio_null(zio, zio->io_spa, 563*1732Sbonwick vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); 564*1732Sbonwick 565789Sahrens for (c = 0; c < rm->rm_cols; c++) { 566789Sahrens rc = &rm->rm_col[c]; 567789Sahrens cvd = vd->vdev_child[rc->rc_col]; 568789Sahrens 569*1732Sbonwick if (rc->rc_error == 0) 570*1732Sbonwick continue; 571*1732Sbonwick 572*1732Sbonwick dprintf("%s resilvered %s @ 0x%llx error %d\n", 573*1732Sbonwick vdev_description(vd), 574*1732Sbonwick vdev_description(cvd), 575*1732Sbonwick zio->io_offset, rc->rc_error); 576789Sahrens 577*1732Sbonwick zio_nowait(zio_vdev_child_io(rio, NULL, cvd, 578*1732Sbonwick rc->rc_offset, rc->rc_data, rc->rc_size, 579*1732Sbonwick ZIO_TYPE_WRITE, zio->io_priority, 580*1732Sbonwick ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | 581*1732Sbonwick ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); 582*1732Sbonwick } 583789Sahrens 584*1732Sbonwick zio_nowait(rio); 585*1732Sbonwick zio_wait_children_done(zio); 586*1732Sbonwick return; 587789Sahrens } 588789Sahrens 589789Sahrens vdev_raidz_map_free(zio); 590789Sahrens zio_next_stage(zio); 591789Sahrens } 592789Sahrens 593789Sahrens static void 594789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 595789Sahrens { 596789Sahrens if (faulted > 1) 5971544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 5981544Seschrock VDEV_AUX_NO_REPLICAS); 599789Sahrens else if (degraded + faulted != 0) 6001544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 601789Sahrens else 6021544Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 603789Sahrens } 604789Sahrens 605789Sahrens vdev_ops_t vdev_raidz_ops = { 606789Sahrens vdev_raidz_open, 607789Sahrens vdev_raidz_close, 608789Sahrens vdev_raidz_asize, 609789Sahrens vdev_raidz_io_start, 610789Sahrens vdev_raidz_io_done, 611789Sahrens vdev_raidz_state_change, 612789Sahrens VDEV_TYPE_RAIDZ, /* name of this vdev type */ 613789Sahrens B_FALSE /* not a leaf vdev */ 614789Sahrens }; 615