/* $OpenBSD: softraid_raid5.c,v 1.16 2014/09/14 14:17:24 jsg Exp $ */
/*
 * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * softraid RAID 5 discipline.
 *
 * Implements striping with distributed parity (left asymmetric layout)
 * over three or more chunks.  Reads from an offline chunk are serviced
 * by regenerating the data from the surviving chunks; writes always use
 * a read-modify-write (or full reconstruct-write) cycle driven by a
 * second "read" work unit that collides with the write work unit.
 */

#include "bio.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/device.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/disk.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/fcntl.h>
#include <sys/disklabel.h>
#include <sys/mount.h>
#include <sys/sensors.h>
#include <sys/stat.h>
#include <sys/task.h>
#include <sys/pool.h>
#include <sys/conf.h>
#include <sys/uio.h>

#include <scsi/scsi_all.h>
#include <scsi/scsiconf.h>
#include <scsi/scsi_disk.h>

#include <dev/softraidvar.h>
#include <dev/rndvar.h>

/* RAID 5 functions. */
int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
	    int, int64_t);
int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
	    int, void *);
int	sr_raid5_init(struct sr_discipline *);
int	sr_raid5_rw(struct sr_workunit *);
int	sr_raid5_openings(struct sr_discipline *);
void	sr_raid5_intr(struct buf *);
int	sr_raid5_wu_done(struct sr_workunit *);
void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid5_set_vol_state(struct sr_discipline *);

int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, daddr_t,
	    void *, int, int, void *);
int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, daddr_t,
	    void *);
int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
	    daddr_t, daddr_t, void *, int, int);
void	sr_raid5_xor(void *, void *, int);

void	sr_raid5_rebuild(struct sr_discipline *);
/* NOTE(review): sr_raid5_scrub is declared here but its definition below
 * is compiled out under #if 0 - the prototype is currently unused. */
void	sr_raid5_scrub(struct sr_discipline *);

/* discipline initialisation. */
void
sr_raid5_discipline_init(struct sr_discipline *sd)
{
	/* Fill out discipline members. */
	sd->sd_type = SR_MD_RAID5;
	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
	sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */
	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */

	/* Setup discipline specific function pointers. */
	sd->sd_assemble = sr_raid5_assemble;
	sd->sd_create = sr_raid5_create;
	sd->sd_openings = sr_raid5_openings;
	sd->sd_rebuild = sr_raid5_rebuild;
	sd->sd_scsi_rw = sr_raid5_rw;
	sd->sd_scsi_intr = sr_raid5_intr;
	sd->sd_scsi_wu_done = sr_raid5_wu_done;
	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
	sd->sd_set_vol_state = sr_raid5_set_vol_state;
}

/*
 * Create a new RAID 5 volume: validate the chunk count, fix the strip
 * size and compute the usable volume size ((N - 1) data chunks, each
 * truncated down to a strip-size multiple of 512-byte blocks).
 */
int
sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{
	if (no_chunk < 3) {
		sr_error(sd->sd_sc, "%s requires three or more chunks",
		    sd->sd_name);
		return EINVAL;
	}

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
	    DEV_BSHIFT) - 1)) * (no_chunk - 1);

	return sr_raid5_init(sd);
}

/* Assemble an existing RAID 5 volume; metadata was loaded by the caller. */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid5_init(sd);
}

/* Common create/assemble initialisation of runtime (non-metadata) state. */
int
sr_raid5_init(struct sr_discipline *sd)
{
	/* Initialise runtime values. */
	sd->mds.mdd_raid5.sr5_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
		sr_error(sd->sd_sc, "invalid strip size");
		return EINVAL;
	}

	return 0;
}

/*
 * Report the number of concurrent I/Os this discipline can accept.
 * Each write consumes two work units (the write plus its colliding
 * read), and two more are reserved for rebuild/scrub.
 */
int
sr_raid5_openings(struct sr_discipline *sd)
{
	/* Two work units per I/O, two for rebuild/scrub. */
	return ((sd->sd_max_wu - 2) >> 1);
}

/*
 * Transition a chunk to a new state, enforcing the legal state machine
 * (panic on an illegal transition), then recompute the volume state and
 * schedule a metadata save.  Runs at splbio; only called on error paths.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}

/*
 * Derive the volume state from the aggregate chunk states and apply it,
 * enforcing the legal volume state machine (panic on an illegal
 * transition).  For RAID 5: all chunks online -> online; more than one
 * chunk not online -> offline; exactly one missing -> degraded (or
 * scrub/rebuild if such a chunk exists).
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}

/* Is this chunk usable as a data source (online or being scrubbed)? */
static inline int
sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
{
	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
	case BIOC_SDONLINE:
	case BIOC_SDSCRUB:
		return 1;
	default:
		return 0;
	}
}

/* Is this chunk currently being rebuilt? */
static inline int
sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
{
	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
	case BIOC_SDREBUILD:
		return 1;
	default:
		return 0;
	}
}

/*
 * Main I/O entry point: split the request into per-strip segments, map
 * each segment to its data and parity chunks (left asymmetric layout)
 * and issue reads directly or writes via sr_raid5_write().  Writes get
 * a companion read work unit (wu_r) that runs first and collides into
 * the deferred write.  Returns 0 on success, 1 on failure.
 */
int
sr_raid5_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	daddr_t			blk, lba;
	int64_t			chunk_offs, lbaoffs, phys_offs, strip_offs;
	int64_t			strip_bits, strip_no, strip_size;
	int64_t			chunk, no_chunk;
	int64_t			length, parity, datalen, row_size;
	void			*data;
	int			s;

	/* blk and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blk, "sr_raid5_rw"))
		goto bad;

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: lba %lld size %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
	    (long long)blk, xs->datalen);

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	/* no_chunk is the number of DATA chunks; one chunk holds parity. */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blk << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* NOTE(review): printf format string lacks a trailing \n. */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: %s failed to get read work unit",
			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		phys_offs = chunk_offs + strip_offs +
		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/*
		 * Map disk offset to data and parity chunks, using a left
		 * asymmetric algorithm for the parity assignment.
		 */
		chunk = strip_no % no_chunk;
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
		if (chunk >= parity)
			chunk++;

		lba = phys_offs >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
		    (row_size - 1);

		scp = sd->sd_vol.sv_chunks[chunk];
		if (xs->flags & SCSI_DATA_IN) {
			switch (scp->src_meta.scm_status) {
			case BIOC_SDONLINE:
			case BIOC_SDSCRUB:
				/*
				 * Chunk is online, issue a single read
				 * request.
				 */
				if (sr_raid5_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL))
					goto bad;
				break;
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (sr_raid5_regenerate(wu, chunk, lba,
				    length, data))
					goto bad;
				break;
			default:
				printf("%s: is offline, can't read\n",
				    DEVNAME(sd->sd_sc));
				goto bad;
			}
		} else {
			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
			    length, data, xs->flags, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		if (wu_r->swu_io_count > 0) {
			/* collide write request with reads */
			wu_r->swu_blk_start = wu->swu_blk_start;
			wu_r->swu_blk_end = wu->swu_blk_end;

			wu->swu_state = SR_WU_DEFERRED;
			wu_r->swu_collider = wu;
			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

			wu = wu_r;
		} else {
			/* No reads were needed; release the spare work unit. */
			sr_scsi_wu_put(sd, wu_r);
		}
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);

bad:
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}

/*
 * Queue reads of the corresponding block on every chunk except the
 * missing one; the completion interrupt XORs each read into "data"
 * (zeroed here), reconstructing the lost block.  Requires the parity
 * on disk to already be correct.  Returns 0 on success, 1 on failure.
 */
int
sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
    daddr_t len, void *data)
{
	struct sr_discipline	*sd = wu->swu_dis;
	int			i;

	/*
	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
	 * from all of the remaining online chunks. This requires the parity
	 * to already be correct.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
	    "regenerating block %llu\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);

	memset(data, 0, len);
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk)
			continue;
		/* A second missing chunk makes regeneration impossible. */
		if (!sr_raid5_chunk_online(sd, i))
			goto bad;
		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
		    0, data))
			goto bad;
	}
	return (0);

bad:
	return (1);
}

/*
 * Queue the I/Os for one strip-sized write: reads (on wu_r) needed to
 * compute the new parity, then the parity and data writes (on wu).
 * See the case analysis in the comment below.  Returns 0 on success,
 * 1 on failure.
 */
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, daddr_t len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	void			*xorbuf;
	int			chunk_online, chunk_rebuild;
	int			parity_online, parity_rebuild;
	int			other_offline = 0, other_rebuild = 0;
	int			i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blk %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): no live parity chunk, just write the data. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* xorbuf starts as the new data; reads XOR into it on completion. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Read in existing data and existing parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Read in existing data from all other chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	return (1);
}

/*
 * Per-CCB completion interrupt: fold a completed read into its XOR
 * target (if any), free temporary buffers, and hand the CCB back to
 * the work unit.  Runs at splbio.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}

/*
 * Work unit completion: decide whether the overall request succeeded,
 * should be retried (reads) or permanently failed (writes).  Discipline
 * and rebuild work units report success unconditionally since there is
 * no error propagation path for them.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}

/*
 * Build one CCB for a chunk-level read or write and queue it on the
 * work unit.  If "data" is NULL a temporary buffer is allocated and
 * marked for free-on-completion.  "xorbuf", if set, is stashed in
 * ccb_opaque so sr_raid5_intr() XORs the completed read into it.
 * Returns 0 on success, -1 on allocation failure.
 */
int
sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
    daddr_t len, void *data, int xsflags, int ccbflags, void *xorbuf)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_ccb		*ccb;

	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
	    "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
	    chunk, (long long)blkno, (long long)len, xorbuf ? "X0R" : "-");

	/* Allocate temporary buffer. */
	if (data == NULL) {
		data = sr_block_get(sd, len);
		if (data == NULL)
			return (-1);
		ccbflags |= SR_CCBF_FREEBUF;
	}

	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
	if (ccb == NULL) {
		if (ccbflags & SR_CCBF_FREEBUF)
			sr_block_put(sd, data, len);
		return (-1);
	}
	ccb->ccb_opaque = xorbuf;
	sr_wu_enqueue_ccb(wu, ccb);

	return (0);
}

/*
 * XOR buffer b into buffer a, 32 bits at a time.  len is in bytes and
 * is assumed to be a multiple of 4 (strip sizes are powers of two).
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t		*xa = a, *xb = b;

	len >>= 2;
	while (len--)
		*xa++ ^= *xb++;
}

/*
 * Rebuild the single chunk marked BIOC_SDREBUILD: for every strip on
 * that chunk, regenerate its contents from the other chunks (read work
 * unit) and write the result back (colliding write work unit), saving
 * progress in the metadata as we go.  Runs in process context and may
 * sleep.
 */
void
sr_raid5_rebuild(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, strip_bits, i, psz, rb;
	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept, percent = 0, old_percent = -1;
	int rebuild_chunk = -1;
	void *xorbuf;

	/* Find the rebuild chunk. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sr_raid5_chunk_rebuild(sd, i)) {
			rebuild_chunk = i;
			break;
		}
	}
	if (rebuild_chunk == -1)
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;

	/* XXX - handle restarts. */

	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
	    row_size);

	for (strip_no = 0; strip_no < chunk_strips; strip_no++) {
		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no +
		    sd->sd_meta->ssd_data_offset;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);

		/*
		 * NOTE(review): the sr_scsi_wu_get(sd, 0) and sr_block_get()
		 * return values are not checked here - presumably the
		 * blocking (flags == 0) variants cannot fail; confirm
		 * against their definitions in softraid.c.
		 */
		wu_w = sr_scsi_wu_get(sd, 0);
		wu_r = sr_scsi_wu_get(sd, 0);

		xorbuf = sr_block_get(sd, strip_size);
		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
		    strip_size, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
			goto bad;

		/* Collide write work unit with read work unit. */
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_REBUILD;
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		/* Block I/O to this strip while we rebuild it. */
		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
		wu_w->swu_blk_start = wu_r->swu_blk_start;
		wu_w->swu_blk_end = wu_r->swu_blk_end;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    wu_r->swu_blk_start, wu_r->swu_blk_end);

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		sr_schedule_wu(wu_r);

		/* Wait for the write work unit to complete. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_rebuild", 0);
			slept = 1;
		}
		/* Yield the CPU so regular I/O can make progress. */
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);

		sr_scsi_wu_put(sd, wu_r);
		sr_scsi_wu_put(sd, wu_w);

		/* Record progress (in volume blocks) for possible restart. */
		sd->sd_meta->ssd_rebuild =
		    (chunk_lba - sd->sd_meta->ssd_data_offset) * chunk_count;

		psz = sd->sd_meta->ssdi.ssd_size;
		rb = sd->sd_meta->ssd_rebuild;
		if (rb > 0)
			percent = 100 - ((psz * 100 - rb * 100) / psz) - 1;
		else
			percent = 0;
		/* Save metadata whenever the completion percentage changes. */
		if (percent != old_percent && strip_no != chunk_strips - 1) {
			if (sr_meta_save(sd, SR_META_DIRTY))
				printf("%s: could not save metadata to %s\n",
				    DEVNAME(sd->sd_sc),
				    sd->sd_meta->ssd_devname);
			old_percent = percent;
		}

		if (sd->sd_reb_abort)
			goto abort;
	}

	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
	    sd->sd_meta->ssd_devname);

	/* all done */
	sd->sd_meta->ssd_rebuild = 0;
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
		    BIOC_SDREBUILD) {
			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
			break;
		}
	}

	return;

abort:
	if (sr_meta_save(sd, SR_META_DIRTY))
		printf("%s: could not save metadata to %s\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
bad:
	return;
}

#if 0
/*
 * Scrub pass (currently compiled out): recompute and rewrite the parity
 * of every stripe.  NOTE(review): the chunk LBAs are a 0xBADCAFE
 * placeholder and wu_w/wu_r are reused across loop iterations without
 * being re-acquired - this code is clearly unfinished.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_scrub", 0);
			slept = 1;
		}
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
	}
done:
	return;
}
#endif