1 /* $OpenBSD: softraid_raid5.c,v 1.15 2014/01/23 00:22:35 jsing Exp $ */ 2 /* 3 * Copyright (c) 2014 Joel Sing <jsing@openbsd.org> 4 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> 5 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20 #include "bio.h" 21 22 #include <sys/param.h> 23 #include <sys/systm.h> 24 #include <sys/buf.h> 25 #include <sys/device.h> 26 #include <sys/ioctl.h> 27 #include <sys/proc.h> 28 #include <sys/malloc.h> 29 #include <sys/kernel.h> 30 #include <sys/disk.h> 31 #include <sys/rwlock.h> 32 #include <sys/queue.h> 33 #include <sys/fcntl.h> 34 #include <sys/disklabel.h> 35 #include <sys/mount.h> 36 #include <sys/sensors.h> 37 #include <sys/stat.h> 38 #include <sys/task.h> 39 #include <sys/pool.h> 40 #include <sys/conf.h> 41 #include <sys/uio.h> 42 43 #include <scsi/scsi_all.h> 44 #include <scsi/scsiconf.h> 45 #include <scsi/scsi_disk.h> 46 47 #include <dev/softraidvar.h> 48 #include <dev/rndvar.h> 49 50 /* RAID 5 functions. */ 51 int sr_raid5_create(struct sr_discipline *, struct bioc_createraid *, 52 int, int64_t); 53 int sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *, 54 int, void *); 55 int sr_raid5_init(struct sr_discipline *); 56 int sr_raid5_rw(struct sr_workunit *); 57 int sr_raid5_openings(struct sr_discipline *); 58 void sr_raid5_intr(struct buf *); 59 int sr_raid5_wu_done(struct sr_workunit *); 60 void sr_raid5_set_chunk_state(struct sr_discipline *, int, int); 61 void sr_raid5_set_vol_state(struct sr_discipline *); 62 63 int sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, daddr_t, 64 void *, int, int, void *); 65 int sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, daddr_t, 66 void *); 67 int sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int, 68 daddr_t, daddr_t, void *, int, int); 69 void sr_raid5_xor(void *, void *, int); 70 71 void sr_raid5_rebuild(struct sr_discipline *); 72 void sr_raid5_scrub(struct sr_discipline *); 73 74 /* discipline initialisation. */ 75 void 76 sr_raid5_discipline_init(struct sr_discipline *sd) 77 { 78 /* Fill out discipline members. */ 79 sd->sd_type = SR_MD_RAID5; 80 strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name)); 81 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE | 82 SR_CAP_REBUILD | SR_CAP_REDUNDANT; 83 sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */ 84 sd->sd_max_wu = SR_RAID5_NOWU + 2; /* Two for scrub/rebuild. */ 85 86 /* Setup discipline specific function pointers. */ 87 sd->sd_assemble = sr_raid5_assemble; 88 sd->sd_create = sr_raid5_create; 89 sd->sd_openings = sr_raid5_openings; 90 sd->sd_rebuild = sr_raid5_rebuild; 91 sd->sd_scsi_rw = sr_raid5_rw; 92 sd->sd_scsi_intr = sr_raid5_intr; 93 sd->sd_scsi_wu_done = sr_raid5_wu_done; 94 sd->sd_set_chunk_state = sr_raid5_set_chunk_state; 95 sd->sd_set_vol_state = sr_raid5_set_vol_state; 96 } 97 98 int 99 sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc, 100 int no_chunk, int64_t coerced_size) 101 { 102 if (no_chunk < 3) { 103 sr_error(sd->sd_sc, "%s requires three or more chunks", 104 sd->sd_name); 105 return EINVAL; 106 } 107 108 /* 109 * XXX add variable strip size later even though MAXPHYS is really 110 * the clever value, users like to tinker with that type of stuff. 111 */ 112 sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS; 113 sd->sd_meta->ssdi.ssd_size = (coerced_size & 114 ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >> 115 DEV_BSHIFT) - 1)) * (no_chunk - 1); 116 117 return sr_raid5_init(sd); 118 } 119 120 int 121 sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc, 122 int no_chunk, void *data) 123 { 124 return sr_raid5_init(sd); 125 } 126 127 int 128 sr_raid5_init(struct sr_discipline *sd) 129 { 130 /* Initialise runtime values. */ 131 sd->mds.mdd_raid5.sr5_strip_bits = 132 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); 133 if (sd->mds.mdd_raid5.sr5_strip_bits == -1) { 134 sr_error(sd->sd_sc, "invalid strip size"); 135 return EINVAL; 136 } 137 138 return 0; 139 } 140 141 int 142 sr_raid5_openings(struct sr_discipline *sd) 143 { 144 /* Two work units per I/O, two for rebuild/scrub. */ 145 return ((sd->sd_max_wu - 2) >> 1); 146 } 147 148 void 149 sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 150 { 151 int old_state, s; 152 153 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 154 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 155 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 156 157 /* ok to go to splbio since this only happens in error path */ 158 s = splbio(); 159 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 160 161 /* multiple IOs to the same chunk that fail will come through here */ 162 if (old_state == new_state) 163 goto done; 164 165 switch (old_state) { 166 case BIOC_SDONLINE: 167 switch (new_state) { 168 case BIOC_SDOFFLINE: 169 case BIOC_SDSCRUB: 170 break; 171 default: 172 goto die; 173 } 174 break; 175 176 case BIOC_SDOFFLINE: 177 if (new_state == BIOC_SDREBUILD) { 178 ; 179 } else 180 goto die; 181 break; 182 183 case BIOC_SDSCRUB: 184 switch (new_state) { 185 case BIOC_SDONLINE: 186 case BIOC_SDOFFLINE: 187 break; 188 default: 189 goto die; 190 } 191 break; 192 193 case BIOC_SDREBUILD: 194 switch (new_state) { 195 case BIOC_SDONLINE: 196 case BIOC_SDOFFLINE: 197 break; 198 default: 199 goto die; 200 } 201 break; 202 203 default: 204 die: 205 splx(s); /* XXX */ 206 panic("%s: %s: %s: invalid chunk state transition " 207 "%d -> %d", DEVNAME(sd->sd_sc), 208 sd->sd_meta->ssd_devname, 209 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 210 old_state, new_state); 211 /* NOTREACHED */ 212 } 213 214 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 215 sd->sd_set_vol_state(sd); 216 217 sd->sd_must_flush = 1; 218 task_add(systq, &sd->sd_meta_save_task); 219 done: 220 splx(s); 221 } 222 223 void 224 sr_raid5_set_vol_state(struct sr_discipline *sd) 225 { 226 int states[SR_MAX_STATES]; 227 int new_state, i, s, nd; 228 int old_state = sd->sd_vol_status; 229 230 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 231 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 232 233 nd = sd->sd_meta->ssdi.ssd_chunk_no; 234 235 for (i = 0; i < SR_MAX_STATES; i++) 236 states[i] = 0; 237 238 for (i = 0; i < nd; i++) { 239 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 240 if (s >= SR_MAX_STATES) 241 panic("%s: %s: %s: invalid chunk state", 242 DEVNAME(sd->sd_sc), 243 sd->sd_meta->ssd_devname, 244 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 245 states[s]++; 246 } 247 248 if (states[BIOC_SDONLINE] == nd) 249 new_state = BIOC_SVONLINE; 250 else if (states[BIOC_SDONLINE] < nd - 1) 251 new_state = BIOC_SVOFFLINE; 252 else if (states[BIOC_SDSCRUB] != 0) 253 new_state = BIOC_SVSCRUB; 254 else if (states[BIOC_SDREBUILD] != 0) 255 new_state = BIOC_SVREBUILD; 256 else if (states[BIOC_SDONLINE] == nd - 1) 257 new_state = BIOC_SVDEGRADED; 258 else { 259 #ifdef SR_DEBUG 260 DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state " 261 "was %d\n", DEVNAME(sd->sd_sc), old_state); 262 for (i = 0; i < nd; i++) 263 DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n", 264 DEVNAME(sd->sd_sc), i, 265 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 266 #endif 267 panic("invalid volume state"); 268 } 269 270 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n", 271 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 272 old_state, new_state); 273 274 switch (old_state) { 275 case BIOC_SVONLINE: 276 switch (new_state) { 277 case BIOC_SVONLINE: /* can go to same state */ 278 case BIOC_SVOFFLINE: 279 case BIOC_SVDEGRADED: 280 case BIOC_SVREBUILD: /* happens on boot */ 281 break; 282 default: 283 goto die; 284 } 285 break; 286 287 case BIOC_SVOFFLINE: 288 /* XXX this might be a little too much */ 289 goto die; 290 291 case BIOC_SVDEGRADED: 292 switch (new_state) { 293 case BIOC_SVOFFLINE: 294 case BIOC_SVREBUILD: 295 case BIOC_SVDEGRADED: /* can go to the same state */ 296 break; 297 default: 298 goto die; 299 } 300 break; 301 302 case BIOC_SVBUILDING: 303 switch (new_state) { 304 case BIOC_SVONLINE: 305 case BIOC_SVOFFLINE: 306 case BIOC_SVBUILDING: /* can go to the same state */ 307 break; 308 default: 309 goto die; 310 } 311 break; 312 313 case BIOC_SVSCRUB: 314 switch (new_state) { 315 case BIOC_SVONLINE: 316 case BIOC_SVOFFLINE: 317 case BIOC_SVDEGRADED: 318 case BIOC_SVSCRUB: /* can go to same state */ 319 break; 320 default: 321 goto die; 322 } 323 break; 324 325 case BIOC_SVREBUILD: 326 switch (new_state) { 327 case BIOC_SVONLINE: 328 case BIOC_SVOFFLINE: 329 case BIOC_SVDEGRADED: 330 case BIOC_SVREBUILD: /* can go to the same state */ 331 break; 332 default: 333 goto die; 334 } 335 break; 336 337 default: 338 die: 339 panic("%s: %s: invalid volume state transition %d -> %d", 340 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 341 old_state, new_state); 342 /* NOTREACHED */ 343 } 344 345 sd->sd_vol_status = new_state; 346 } 347 348 static inline int 349 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk) 350 { 351 switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) { 352 case BIOC_SDONLINE: 353 case BIOC_SDSCRUB: 354 return 1; 355 default: 356 return 0; 357 } 358 } 359 360 static inline int 361 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk) 362 { 363 switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) { 364 case BIOC_SDREBUILD: 365 return 1; 366 default: 367 return 0; 368 } 369 } 370 371 int 372 sr_raid5_rw(struct sr_workunit *wu) 373 { 374 struct sr_workunit *wu_r = NULL; 375 struct sr_discipline *sd = wu->swu_dis; 376 struct scsi_xfer *xs = wu->swu_xs; 377 struct sr_chunk *scp; 378 daddr_t blk, lba; 379 int64_t chunk_offs, lbaoffs, phys_offs, strip_offs; 380 int64_t strip_bits, strip_no, strip_size; 381 int64_t chunk, no_chunk; 382 int64_t length, parity, datalen, row_size; 383 void *data; 384 int s; 385 386 /* blk and scsi error will be handled by sr_validate_io */ 387 if (sr_validate_io(wu, &blk, "sr_raid5_rw")) 388 goto bad; 389 390 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: lba %lld size %d\n", 391 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 392 (xs->flags & SCSI_DATA_IN) ? "read" : "write", 393 (long long)blk, xs->datalen); 394 395 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 396 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 397 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1; 398 row_size = (no_chunk << strip_bits) >> DEV_BSHIFT; 399 400 data = xs->data; 401 datalen = xs->datalen; 402 lbaoffs = blk << DEV_BSHIFT; 403 404 if (xs->flags & SCSI_DATA_OUT) { 405 if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){ 406 printf("%s: %s failed to get read work unit", 407 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 408 goto bad; 409 } 410 wu_r->swu_state = SR_WU_INPROGRESS; 411 wu_r->swu_flags |= SR_WUF_DISCIPLINE; 412 } 413 414 wu->swu_blk_start = 0; 415 while (datalen != 0) { 416 strip_no = lbaoffs >> strip_bits; 417 strip_offs = lbaoffs & (strip_size - 1); 418 chunk_offs = (strip_no / no_chunk) << strip_bits; 419 phys_offs = chunk_offs + strip_offs + 420 (sd->sd_meta->ssd_data_offset << DEV_BSHIFT); 421 422 /* get size remaining in this stripe */ 423 length = MIN(strip_size - strip_offs, datalen); 424 425 /* 426 * Map disk offset to data and parity chunks, using a left 427 * asymmetric algorithm for the parity assignment. 428 */ 429 chunk = strip_no % no_chunk; 430 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1)); 431 if (chunk >= parity) 432 chunk++; 433 434 lba = phys_offs >> DEV_BSHIFT; 435 436 /* XXX big hammer.. exclude I/O from entire stripe */ 437 if (wu->swu_blk_start == 0) 438 wu->swu_blk_start = (strip_no / no_chunk) * row_size; 439 wu->swu_blk_end = (strip_no / no_chunk) * row_size + 440 (row_size - 1); 441 442 scp = sd->sd_vol.sv_chunks[chunk]; 443 if (xs->flags & SCSI_DATA_IN) { 444 switch (scp->src_meta.scm_status) { 445 case BIOC_SDONLINE: 446 case BIOC_SDSCRUB: 447 /* 448 * Chunk is online, issue a single read 449 * request. 450 */ 451 if (sr_raid5_addio(wu, chunk, lba, length, 452 data, xs->flags, 0, NULL)) 453 goto bad; 454 break; 455 case BIOC_SDOFFLINE: 456 case BIOC_SDREBUILD: 457 case BIOC_SDHOTSPARE: 458 if (sr_raid5_regenerate(wu, chunk, lba, 459 length, data)) 460 goto bad; 461 break; 462 default: 463 printf("%s: is offline, can't read\n", 464 DEVNAME(sd->sd_sc)); 465 goto bad; 466 } 467 } else { 468 if (sr_raid5_write(wu, wu_r, chunk, parity, lba, 469 length, data, xs->flags, 0)) 470 goto bad; 471 } 472 473 /* advance to next block */ 474 lbaoffs += length; 475 datalen -= length; 476 data += length; 477 } 478 479 s = splbio(); 480 if (wu_r) { 481 if (wu_r->swu_io_count > 0) { 482 /* collide write request with reads */ 483 wu_r->swu_blk_start = wu->swu_blk_start; 484 wu_r->swu_blk_end = wu->swu_blk_end; 485 486 wu->swu_state = SR_WU_DEFERRED; 487 wu_r->swu_collider = wu; 488 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link); 489 490 wu = wu_r; 491 } else { 492 sr_scsi_wu_put(sd, wu_r); 493 } 494 } 495 splx(s); 496 497 sr_schedule_wu(wu); 498 499 return (0); 500 501 bad: 502 /* wu is unwound by sr_wu_put */ 503 if (wu_r) 504 sr_scsi_wu_put(sd, wu_r); 505 return (1); 506 } 507 508 int 509 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno, 510 daddr_t len, void *data) 511 { 512 struct sr_discipline *sd = wu->swu_dis; 513 int i; 514 515 /* 516 * Regenerate a block on a RAID 5 volume by xoring the data and parity 517 * from all of the remaining online chunks. This requires the parity 518 * to already be correct. 519 */ 520 521 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, " 522 "regenerating block %llu\n", 523 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno); 524 525 memset(data, 0, len); 526 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 527 if (i == chunk) 528 continue; 529 if (!sr_raid5_chunk_online(sd, i)) 530 goto bad; 531 if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN, 532 0, data)) 533 goto bad; 534 } 535 return (0); 536 537 bad: 538 return (1); 539 } 540 541 int 542 sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk, 543 int parity, daddr_t blkno, daddr_t len, void *data, int xsflags, 544 int ccbflags) 545 { 546 struct sr_discipline *sd = wu->swu_dis; 547 struct scsi_xfer *xs = wu->swu_xs; 548 void *xorbuf; 549 int chunk_online, chunk_rebuild; 550 int parity_online, parity_rebuild; 551 int other_offline = 0, other_rebuild = 0; 552 int i; 553 554 /* 555 * Perform a write to a RAID 5 volume. This write routine does not 556 * require the parity to already be correct and will operate on a 557 * uninitialised volume. 558 * 559 * There are four possible cases: 560 * 561 * 1) All data chunks and parity are online. In this case we read the 562 * data from all data chunks, except the one we are writing to, in 563 * order to calculate and write the new parity. 564 * 565 * 2) The parity chunk is offline. In this case we only need to write 566 * to the data chunk. No parity calculation is required. 567 * 568 * 3) The data chunk is offline. In this case we read the data from all 569 * online chunks in order to calculate and write the new parity. 570 * This is the same as (1) except we do not write the data chunk. 571 * 572 * 4) A different data chunk is offline. The new parity is calculated 573 * by taking the existing parity, xoring the original data and 574 * xoring in the new data. This requires that the parity already be 575 * correct, which it will be if any of the data chunks has 576 * previously been written. 577 * 578 * There is an additional complication introduced by a chunk that is 579 * being rebuilt. If this is the data or parity chunk, then we want 580 * to write to it as per normal. If it is another data chunk then we 581 * need to presume that it has not yet been regenerated and use the 582 * same method as detailed in (4) above. 583 */ 584 585 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i " 586 "blk %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 587 chunk, parity, (unsigned long long)blkno); 588 589 chunk_online = sr_raid5_chunk_online(sd, chunk); 590 chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk); 591 parity_online = sr_raid5_chunk_online(sd, parity); 592 parity_rebuild = sr_raid5_chunk_rebuild(sd, parity); 593 594 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 595 if (i == chunk || i == parity) 596 continue; 597 if (sr_raid5_chunk_rebuild(sd, i)) 598 other_rebuild = 1; 599 else if (!sr_raid5_chunk_online(sd, i)) 600 other_offline = 1; 601 } 602 603 DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, " 604 "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 605 chunk_online, parity_online, other_offline); 606 607 if (!parity_online && !parity_rebuild) 608 goto data_write; 609 610 xorbuf = sr_block_get(sd, len); 611 if (xorbuf == NULL) 612 goto bad; 613 memcpy(xorbuf, data, len); 614 615 if (other_offline || other_rebuild) { 616 617 /* 618 * XXX - If we can guarantee that this LBA has been scrubbed 619 * then we can also take this faster path. 620 */ 621 622 /* Read in existing data and existing parity. */ 623 if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL, 624 SCSI_DATA_IN, 0, xorbuf)) 625 goto bad; 626 if (sr_raid5_addio(wu_r, parity, blkno, len, NULL, 627 SCSI_DATA_IN, 0, xorbuf)) 628 goto bad; 629 630 } else { 631 632 /* Read in existing data from all other chunks. */ 633 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 634 if (i == chunk || i == parity) 635 continue; 636 if (sr_raid5_addio(wu_r, i, blkno, len, NULL, 637 SCSI_DATA_IN, 0, xorbuf)) 638 goto bad; 639 } 640 641 } 642 643 /* Write new parity. */ 644 if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags, 645 SR_CCBF_FREEBUF, NULL)) 646 goto bad; 647 648 data_write: 649 /* Write new data. */ 650 if (chunk_online || chunk_rebuild) 651 if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags, 652 0, NULL)) 653 goto bad; 654 655 return (0); 656 657 bad: 658 return (1); 659 } 660 661 void 662 sr_raid5_intr(struct buf *bp) 663 { 664 struct sr_ccb *ccb = (struct sr_ccb *)bp; 665 struct sr_workunit *wu = ccb->ccb_wu; 666 struct sr_discipline *sd = wu->swu_dis; 667 int s; 668 669 DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n", 670 DEVNAME(sd->sd_sc), bp, wu->swu_xs); 671 672 s = splbio(); 673 sr_ccb_done(ccb); 674 675 /* XXX - Should this be done via the taskq? */ 676 677 /* XOR data to result. */ 678 if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque) 679 sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data, 680 ccb->ccb_buf.b_bcount); 681 682 /* Free allocated data buffer. */ 683 if (ccb->ccb_flags & SR_CCBF_FREEBUF) { 684 sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount); 685 ccb->ccb_buf.b_data = NULL; 686 } 687 688 sr_wu_done(wu); 689 splx(s); 690 } 691 692 int 693 sr_raid5_wu_done(struct sr_workunit *wu) 694 { 695 struct sr_discipline *sd = wu->swu_dis; 696 struct scsi_xfer *xs = wu->swu_xs; 697 698 /* XXX - we have no way of propagating errors... */ 699 if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD)) 700 return SR_WU_OK; 701 702 /* XXX - This is insufficient for RAID 5. */ 703 if (wu->swu_ios_succeeded > 0) { 704 xs->error = XS_NOERROR; 705 return SR_WU_OK; 706 } 707 708 if (xs->flags & SCSI_DATA_IN) { 709 printf("%s: retrying read on block %lld\n", 710 sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start); 711 sr_wu_release_ccbs(wu); 712 wu->swu_state = SR_WU_RESTART; 713 if (sd->sd_scsi_rw(wu) == 0) 714 return SR_WU_RESTART; 715 } else { 716 /* XXX - retry write if we just went from online to degraded. */ 717 printf("%s: permanently fail write on block %lld\n", 718 sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start); 719 } 720 721 wu->swu_state = SR_WU_FAILED; 722 xs->error = XS_DRIVER_STUFFUP; 723 724 return SR_WU_FAILED; 725 } 726 727 int 728 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno, 729 daddr_t len, void *data, int xsflags, int ccbflags, void *xorbuf) 730 { 731 struct sr_discipline *sd = wu->swu_dis; 732 struct sr_ccb *ccb; 733 734 DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld " 735 "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write", 736 chunk, (long long)blkno, (long long)len, xorbuf ? "X0R" : "-"); 737 738 /* Allocate temporary buffer. */ 739 if (data == NULL) { 740 data = sr_block_get(sd, len); 741 if (data == NULL) 742 return (-1); 743 ccbflags |= SR_CCBF_FREEBUF; 744 } 745 746 ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags); 747 if (ccb == NULL) { 748 if (ccbflags & SR_CCBF_FREEBUF) 749 sr_block_put(sd, data, len); 750 return (-1); 751 } 752 ccb->ccb_opaque = xorbuf; 753 sr_wu_enqueue_ccb(wu, ccb); 754 755 return (0); 756 } 757 758 void 759 sr_raid5_xor(void *a, void *b, int len) 760 { 761 uint32_t *xa = a, *xb = b; 762 763 len >>= 2; 764 while (len--) 765 *xa++ ^= *xb++; 766 } 767 768 void 769 sr_raid5_rebuild(struct sr_discipline *sd) 770 { 771 int64_t strip_no, strip_size, strip_bits, i, psz, rb; 772 int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size; 773 struct sr_workunit *wu_r, *wu_w; 774 int s, slept, percent = 0, old_percent = -1; 775 int rebuild_chunk = -1; 776 void *xorbuf; 777 778 /* Find the rebuild chunk. */ 779 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 780 if (sr_raid5_chunk_rebuild(sd, i)) { 781 rebuild_chunk = i; 782 break; 783 } 784 } 785 if (rebuild_chunk == -1) 786 goto bad; 787 788 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 789 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 790 chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1; 791 chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count; 792 chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits; 793 row_size = (chunk_count << strip_bits) >> DEV_BSHIFT; 794 795 /* XXX - handle restarts. */ 796 DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, " 797 "chunk count = %lld, chunk size = %lld, chunk strips = %lld, " 798 "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 799 sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips, 800 row_size); 801 802 for (strip_no = 0; strip_no < chunk_strips; strip_no++) { 803 chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no + 804 sd->sd_meta->ssd_data_offset; 805 806 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, " 807 "chunk lba = %lld\n", DEVNAME(sd->sd_sc), 808 sd->sd_meta->ssd_devname, strip_no, chunk_lba); 809 810 wu_w = sr_scsi_wu_get(sd, 0); 811 wu_r = sr_scsi_wu_get(sd, 0); 812 813 xorbuf = sr_block_get(sd, strip_size); 814 if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba, 815 strip_size, xorbuf)) 816 goto bad; 817 if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size, 818 xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL)) 819 goto bad; 820 821 /* Collide write work unit with read work unit. */ 822 wu_r->swu_state = SR_WU_INPROGRESS; 823 wu_r->swu_flags |= SR_WUF_REBUILD; 824 wu_w->swu_state = SR_WU_DEFERRED; 825 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP; 826 wu_r->swu_collider = wu_w; 827 828 /* Block I/O to this strip while we rebuild it. */ 829 wu_r->swu_blk_start = (strip_no / chunk_count) * row_size; 830 wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1; 831 wu_w->swu_blk_start = wu_r->swu_blk_start; 832 wu_w->swu_blk_end = wu_r->swu_blk_end; 833 834 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, " 835 "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc), 836 sd->sd_meta->ssd_devname, 837 wu_r->swu_blk_start, wu_r->swu_blk_end); 838 839 s = splbio(); 840 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 841 splx(s); 842 843 sr_schedule_wu(wu_r); 844 845 slept = 0; 846 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) { 847 tsleep(wu_w, PRIBIO, "sr_rebuild", 0); 848 slept = 1; 849 } 850 if (!slept) 851 tsleep(sd->sd_sc, PWAIT, "sr_yield", 1); 852 853 sr_scsi_wu_put(sd, wu_r); 854 sr_scsi_wu_put(sd, wu_w); 855 856 sd->sd_meta->ssd_rebuild = 857 (chunk_lba - sd->sd_meta->ssd_data_offset) * chunk_count; 858 859 psz = sd->sd_meta->ssdi.ssd_size; 860 rb = sd->sd_meta->ssd_rebuild; 861 if (rb > 0) 862 percent = 100 - ((psz * 100 - rb * 100) / psz) - 1; 863 else 864 percent = 0; 865 if (percent != old_percent && strip_no != chunk_strips - 1) { 866 if (sr_meta_save(sd, SR_META_DIRTY)) 867 printf("%s: could not save metadata to %s\n", 868 DEVNAME(sd->sd_sc), 869 sd->sd_meta->ssd_devname); 870 old_percent = percent; 871 } 872 873 if (sd->sd_reb_abort) 874 goto abort; 875 } 876 877 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc), 878 sd->sd_meta->ssd_devname); 879 880 /* all done */ 881 sd->sd_meta->ssd_rebuild = 0; 882 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 883 if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status == 884 BIOC_SDREBUILD) { 885 sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE); 886 break; 887 } 888 } 889 890 return; 891 892 abort: 893 if (sr_meta_save(sd, SR_META_DIRTY)) 894 printf("%s: could not save metadata to %s\n", 895 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 896 bad: 897 return; 898 } 899 900 #if 0 901 void 902 sr_raid5_scrub(struct sr_discipline *sd) 903 { 904 int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits; 905 int64_t i; 906 struct sr_workunit *wu_r, *wu_w; 907 int s, slept; 908 void *xorbuf; 909 910 wu_w = sr_scsi_wu_get(sd, 0); 911 wu_r = sr_scsi_wu_get(sd, 0); 912 913 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1; 914 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 915 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 916 max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits; 917 918 for (strip_no = 0; strip_no < max_strip; strip_no++) { 919 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1)); 920 921 xorbuf = sr_block_get(sd, strip_size); 922 for (i = 0; i <= no_chunk; i++) { 923 if (i != parity) 924 sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size, 925 NULL, SCSI_DATA_IN, 0, xorbuf); 926 } 927 sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf, 928 SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL); 929 930 wu_r->swu_flags |= SR_WUF_REBUILD; 931 932 /* Collide wu_w with wu_r */ 933 wu_w->swu_state = SR_WU_DEFERRED; 934 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP; 935 wu_r->swu_collider = wu_w; 936 937 s = splbio(); 938 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 939 splx(s); 940 941 wu_r->swu_state = SR_WU_INPROGRESS; 942 sr_schedule_wu(wu_r); 943 944 slept = 0; 945 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) { 946 tsleep(wu_w, PRIBIO, "sr_scrub", 0); 947 slept = 1; 948 } 949 if (!slept) 950 tsleep(sd->sd_sc, PWAIT, "sr_yield", 1); 951 } 952 done: 953 return; 954 } 955 #endif 956