/* $OpenBSD: softraid_raid6.c,v 1.25 2011/12/25 15:28:17 jsing Exp $ */
/*
 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "bio.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/device.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/disk.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/fcntl.h>
#include <sys/disklabel.h>
#include <sys/mount.h>
#include <sys/sensors.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/uio.h>

#include <scsi/scsi_all.h>
#include <scsi/scsiconf.h>
#include <scsi/scsi_disk.h>

#include <dev/softraidvar.h>
#include <dev/rndvar.h>

uint8_t	*gf_map[256];
uint8_t	 gf_pow[768];
int	 gf_log[256];

/* RAID 6 functions. */
int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
	    int, int64_t);
int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
	    int);
int	sr_raid6_alloc_resources(struct sr_discipline *);
int	sr_raid6_free_resources(struct sr_discipline *);
int	sr_raid6_rw(struct sr_workunit *);
int	sr_raid6_openings(struct sr_discipline *);
void	sr_raid6_intr(struct buf *);
void	sr_raid6_recreate_wu(struct sr_workunit *);
void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid6_set_vol_state(struct sr_discipline *);

void	sr_raid6_xorp(void *, void *, int);
void	sr_raid6_xorq(void *, void *, int, int);
int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
	    void *, int, int, void *, void *, int);
void	sr_dump(void *, int);
void	sr_raid6_scrub(struct sr_discipline *);
int	sr_failio(struct sr_workunit *);

void	*sr_get_block(struct sr_discipline *, int);
void	sr_put_block(struct sr_discipline *, void *, int);

void	gf_init(void);
uint8_t	gf_inv(uint8_t);
int	gf_premul(uint8_t);
uint8_t	gf_mul(uint8_t, uint8_t);

#define SR_NOFAIL	0x00
#define SR_FAILX	(1L << 0)
#define SR_FAILY	(1L << 1)
#define SR_FAILP	(1L << 2)
#define SR_FAILQ	(1L << 3)

struct sr_raid6_opaque {
	int	gn;
	void	*pbuf;
	void	*qbuf;
};

/* discipline initialisation. */
void
sr_raid6_discipline_init(struct sr_discipline *sd)
{

	/* Initialize GF256 tables. */
	gf_init();

	/* Fill out discipline members. */
	sd->sd_type = SR_MD_RAID6;
	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE;
	sd->sd_max_wu = SR_RAID6_NOWU;

	/* Setup discipline specific function pointers. */
	sd->sd_alloc_resources = sr_raid6_alloc_resources;
	sd->sd_assemble = sr_raid6_assemble;
	sd->sd_create = sr_raid6_create;
	sd->sd_free_resources = sr_raid6_free_resources;
	sd->sd_openings = sr_raid6_openings;
	sd->sd_scsi_rw = sr_raid6_rw;
	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
	sd->sd_set_vol_state = sr_raid6_set_vol_state;
}

int
sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{

	if (no_chunk < 4)
		return EINVAL;

	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~((sd->sd_meta->ssdi.ssd_strip_size >> DEV_BSHIFT) - 1)) *
	    (no_chunk - 2);

	/* only if stripsize <= MAXPHYS */
	sd->sd_max_ccb_per_wu = max(6, 2 * no_chunk);

	return 0;
}

int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk)
{

	/* only if stripsize <= MAXPHYS */
	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);

	return 0;
}

int
sr_raid6_openings(struct sr_discipline *sd)
{
	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
}

int
sr_raid6_alloc_resources(struct sr_discipline *sd)
{
	int			rv = EINVAL;

	if (!sd)
		return (rv);

	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
	    DEVNAME(sd->sd_sc));

	if (sr_wu_alloc(sd))
		goto bad;
	if (sr_ccb_alloc(sd))
		goto bad;

	/* setup runtime values */
	sd->mds.mdd_raid6.sr6_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
		goto bad;

	rv = 0;
bad:
	return (rv);
}

int
sr_raid6_free_resources(struct sr_discipline *sd)
{
	int			rv = EINVAL;

	if (!sd)
		return (rv);

	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
	    DEVNAME(sd->sd_sc));

	sr_wu_free(sd);
	sr_ccb_free(sd);

	rv = 0;
	return (rv);
}

void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	sd->sd_must_flush = 1;
	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
done:
	splx(s);
}

void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}

/* modes:
 *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
 *		SR_CCBF_FREEBUF, qbuf, NULL, 0);
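 *
 *	(In each mode the buffer passed in the pbuf slot is XORed with the
 *	 data read from chunk i, and the buffer passed in the qbuf slot is
 *	 XORed with that data multiplied by gn in GF(2^8); both
 *	 accumulations are performed by sr_raid6_intr() as the reads
 *	 complete.)
 *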
 *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
 *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
 *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
 *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
 */

int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	int			s, fail, i, gxinv, pxinv;
	daddr64_t		blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
	daddr64_t		strip_size, no_chunk, lba, chunk_offs, phys_offs;
	daddr64_t		strip_bits, length, strip_offs, datalen, row_size;
	void			*pbuf, *data, *qbuf;

	/* blk and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs = blk << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT)
		/* create write workunit */
		if ((wu_r = scsi_io_get(&sd->sd_iopool, SCSI_NOSLEEP)) == NULL) {
			printf("%s: can't get wu_r\n", DEVNAME(sd->sd_sc));
			goto bad;
		}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		phys_offs = chunk_offs + strip_offs +
		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk + 2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = phys_offs >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i = 0; i < no_chunk + 2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good; issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				gxinv = gf_inv(gf_pow[chunk]);

				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
				    SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data,
				    gxinv))
					goto bad;

				/* Read Dz * gz * inv(gx) */
				for (i = 0; i < no_chunk + 2; i++) {
					if (i == qchunk || i == pchunk || i == chunk)
						continue;

					if (sr_raid6_addio(wu, i, lba,
					    length, NULL, SCSI_DATA_IN,
					    SR_CCBF_FREEBUF, NULL,
					    data, gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}

				/* data will contain correct value on completion */
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
				pxinv = gf_mul(gf_pow[fchunk], gxinv);

				/* read Q * inv(gx + gy) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba,
				    length, NULL, SCSI_DATA_IN,
				    SR_CCBF_FREEBUF, NULL,
				    data, gxinv))
					goto bad;

				/* read P * gy * inv(gx + gy) */
				if (sr_raid6_addio(wu, pchunk, lba,
				    length, NULL, SCSI_DATA_IN,
				    SR_CCBF_FREEBUF, NULL,
				    data, pxinv))
					goto bad;

				/*
				 * Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
				 *   Q:  sr_raid6_xorp(qbuf, --, length);
				 *   P:  sr_raid6_xorp(pbuf, --, length);
				 *   Dz: sr_raid6_xorp(pbuf, --, length);
				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
				 */
				for (i = 0; i < no_chunk + 2; i++) {
					if (i == qchunk || i == pchunk ||
					    i == chunk || i == fchunk)
						continue;

					/* read Dz * (gz + gy) * inv(gx + gy) */
					if (sr_raid6_addio(wu, i, lba,
					    length, NULL, SCSI_DATA_IN,
					    SR_CCBF_FREEBUF, NULL, data,
					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}
			} else {
				/*
				 * Two cases: single disk (Dx) or (Dx+Q);
				 *   Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/*
				 * Calculate: Dx = P^Dz
				 *   P:  sr_raid6_xorp(data, ---, length);
				 *   Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk + 2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, data,
						    NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on completion */
			}
		} else {
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * Initialize pbuf with the contents of the new data
			 * to be written.  This will be XORed with the old
			 * data and old parity in the intr routine; the
			 * result in pbuf is the new parity data.  In GF(2^8)
			 * terms, for data chunk n with coefficient
			 * gn = gf_pow[chunk]:
			 *	P_new = P_old ^ Dn_old ^ Dn_new
			 *	Q_new = Q_old ^ gn*Dn_old ^ gn*Dn_new
			 * pbuf and qbuf accumulate the right-hand sides and
			 * are then written out as the new parities.
			 */
			qbuf = sr_get_block(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_get_block(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
			    SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
			    gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
			    SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
			    SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		/* collide write request with reads */
		wu_r->swu_blk_start = wu->swu_blk_start;
		wu_r->swu_blk_end = wu->swu_blk_end;

		wu->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

		wu = wu_r;
	}

	/* rebuild io, let rebuild routine deal with it */
	if (wu->swu_flags & SR_WUF_REBUILD)
		goto queued;

	/* current io failed, restart */
	if (wu->swu_state == SR_WU_RESTART)
		goto start;

	/* deferred io failed, don't restart */
	if (wu->swu_state == SR_WU_REQUEUE)
		goto queued;

	if (sr_check_io_collision(wu))
		goto queued;

start:
	sr_raid_startwu(wu);
queued:
	splx(s);
	return (0);
bad:
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		scsi_io_put(&sd->sd_iopool, wu_r);
	return (1);
}

/* Handle failure I/O completion */
int
sr_failio(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_ccb		*ccb;

	if (!(wu->swu_flags & SR_WUF_FAIL))
		return (0);

	/* wu is a 'fake'; don't do real I/O, just call the intr routine */
	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
		sr_raid6_intr(&ccb->ccb_buf);
	return (1);
}

void
sr_raid6_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_softc		*sc = sd->sd_sc;
	struct sr_raid6_opaque	*pq = ccb->ccb_opaque;
	int			s, pend;

	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
	    DEVNAME(sc), bp, xs);

	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
	    ccb->ccb_buf.b_blkno, ccb->ccb_target);

	s = splbio();

	if (ccb->ccb_buf.b_flags & B_ERROR) {
		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
		printf("io error: disk %x\n", ccb->ccb_target);
		wu->swu_ios_failed++;
		ccb->ccb_state = SR_CCB_FAILED;
		if (ccb->ccb_target != -1)
			sd->sd_set_chunk_state(sd, ccb->ccb_target,
			    BIOC_SDOFFLINE);
		else
			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
	} else {
		ccb->ccb_state = SR_CCB_OK;
		wu->swu_ios_succeeded++;

		/* XOR data to result */
		if (pq) {
			if (pq->pbuf)
				/* Calculate xor-parity */
				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
				    ccb->ccb_buf.b_bcount);
			if (pq->qbuf)
				/* Calculate q-parity */
				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
				    ccb->ccb_buf.b_bcount, pq->gn);
			free(pq, M_DEVBUF);
			ccb->ccb_opaque = NULL;
		}
	}

	/* free allocated data buffer */
	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
		sr_put_block(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}
	wu->swu_ios_complete++;

	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
	    wu->swu_ios_failed);

	if (wu->swu_ios_complete >= wu->swu_io_count) {

		/* if all ios failed, retry reads and give up on writes */
		if (wu->swu_ios_failed == wu->swu_ios_complete) {
			if (xs->flags & SCSI_DATA_IN) {
				printf("%s: retrying read on block %lld\n",
				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
				sr_ccb_put(ccb);
				TAILQ_INIT(&wu->swu_ccb);
				wu->swu_state = SR_WU_RESTART;
				if (sd->sd_scsi_rw(wu))
					goto bad;
				else
					goto retry;
			} else {
				printf("%s: permanently fail write on block "
				    "%lld\n", DEVNAME(sc),
				    ccb->ccb_buf.b_blkno);
				xs->error = XS_DRIVER_STUFFUP;
				goto bad;
			}
		}

		if (xs != NULL) {
			xs->error = XS_NOERROR;
			xs->resid = 0;
		}

		pend = 0;
		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
			if (wu == wup) {
				/* wu on pendq, remove */
				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
				pend = 1;

				if (wu->swu_collider) {
					if (wu->swu_ios_failed)
						/* toss all ccbs and recreate */
						sr_raid6_recreate_wu(wu->swu_collider);

					/* restart deferred wu */
					wu->swu_collider->swu_state =
					    SR_WU_INPROGRESS;
					TAILQ_REMOVE(&sd->sd_wu_defq,
					    wu->swu_collider, swu_link);
					if (sr_failio(wu->swu_collider) == 0)
						sr_raid_startwu(wu->swu_collider);
				}
				break;
			}
		}

		if (!pend)
			printf("%s: wu: %p not on pending queue\n",
			    DEVNAME(sc), wu);

		if (wu->swu_flags & SR_WUF_REBUILD) {
			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
				wakeup(wu);
			}
		} else {
			if (xs != NULL)
				scsi_done(xs);
			else
				scsi_io_put(&sd->sd_iopool, wu);
		}

		if (sd->sd_sync && sd->sd_wu_pending == 0)
			wakeup(sd);
	}

retry:
	splx(s);
	return;
bad:
	xs->error = XS_DRIVER_STUFFUP;
	if (wu->swu_flags & SR_WUF_REBUILD) {
		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
		wakeup(wu);
	} else {
		scsi_done(xs);
	}

	splx(s);
}

void
sr_raid6_recreate_wu(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_workunit	*wup = wu;
	struct sr_ccb		*ccb;

	do {
		DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n",
		    DEVNAME(sd->sd_sc), wup);

		/* toss all ccbs */
		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
			sr_ccb_put(ccb);
		}
		TAILQ_INIT(&wup->swu_ccb);

		/* recreate ccbs */
		wup->swu_state = SR_WU_REQUEUE;
		if (sd->sd_scsi_rw(wup))
			panic("could not requeue io");

		wup = wup->swu_collider;
	} while (wup);
}

int
sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len,
    void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_ccb		*ccb;
	struct sr_raid6_opaque	*pqbuf;

	ccb = sr_ccb_get(sd);
	if (!ccb)
		return (-1);

	/* allocate temporary buffer */
	if (data == NULL) {
		data = sr_get_block(sd, len);
		if (data == NULL)
			return (-1);
	}

	DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n",
	    flag & SCSI_DATA_IN ? "read" : "write",
	    dsk, blk, len, pbuf, qbuf);

	ccb->ccb_flag = ccbflag;
	if (flag & SCSI_POLL) {
		ccb->ccb_buf.b_flags = 0;
		ccb->ccb_buf.b_iodone = NULL;
	} else {
		ccb->ccb_buf.b_flags = B_CALL;
		ccb->ccb_buf.b_iodone = sr_raid6_intr;
	}
	if (flag & SCSI_DATA_IN)
		ccb->ccb_buf.b_flags |= B_READ;
	else
		ccb->ccb_buf.b_flags |= B_WRITE;

	/* add offset for metadata */
	ccb->ccb_buf.b_flags |= B_PHYS;
	ccb->ccb_buf.b_blkno = blk;
	ccb->ccb_buf.b_bcount = len;
	ccb->ccb_buf.b_bufsize = len;
	ccb->ccb_buf.b_resid = len;
	ccb->ccb_buf.b_data = data;
	ccb->ccb_buf.b_error = 0;
	ccb->ccb_buf.b_proc = curproc;
	ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm;
	ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn;
	ccb->ccb_buf.b_bq = NULL;
	if ((ccb->ccb_buf.b_flags & B_READ) == 0)
		ccb->ccb_buf.b_vp->v_numoutput++;

	ccb->ccb_wu = wu;
	ccb->ccb_target = dsk;
	if (pbuf || qbuf) {
		if (qbuf && gf_premul(gn))
			return (-1);

		pqbuf = malloc(sizeof(struct sr_raid6_opaque),
		    M_DEVBUF, M_ZERO | M_NOWAIT);
		if (pqbuf == NULL) {
			sr_ccb_put(ccb);
			return (-1);
		}
		pqbuf->pbuf = pbuf;
		pqbuf->qbuf = qbuf;
		pqbuf->gn = gn;
		ccb->ccb_opaque = pqbuf;
	}

	LIST_INIT(&ccb->ccb_buf.b_dep);
	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);

	DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d "
	    "b_blkno: %x b_flags 0x%0x b_data %p\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno,
	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);

	wu->swu_io_count++;

	return (0);
}

/*
 * Perform RAID 6 parity calculation.
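 *
 * With D_i the data strips of a row (i = chunk index within the volume,
 * skipping the rotating P and Q chunks) the two parity strips are
 *
 *	P = D_a ^ D_b ^ ...			(plain XOR)
 *	Q = gf_pow[a]*D_a ^ gf_pow[b]*D_b ^ ...	(GF(2^8) multiply-accumulate)
 *
 * which sr_raid6_xorp() and sr_raid6_xorq() below accumulate one chunk
 * at a time.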
 * (P = xor parity, Q = GF256 parity, D = data, gn = disk#.)
 */
void
sr_raid6_xorp(void *p, void *d, int len)
{
	uint32_t	*pbuf = p, *data = d;

	len >>= 2;
	while (len--)
		*pbuf++ ^= *data++;
}

void
sr_raid6_xorq(void *q, void *d, int len, int gn)
{
	uint32_t	*qbuf = q, *data = d, x;
	uint8_t		*gn_map = gf_map[gn];

	len >>= 2;
	while (len--) {
		x = *data++;
		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
		    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
		    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
		    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
	}
}

/* Create GF256 log/pow tables: polynomial = 0x11D */
void
gf_init(void)
{
	int		i;
	uint8_t		p = 1;

	/* use 2N pow table to avoid using % in multiply */
	for (i = 0; i < 256; i++) {
		gf_log[p] = i;
		gf_pow[i] = gf_pow[i + 255] = p;
		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
	}
	gf_log[0] = 512;
}

uint8_t
gf_inv(uint8_t a)
{
	return gf_pow[255 - gf_log[a]];
}

uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	return gf_pow[gf_log[a] + gf_log[b]];
}

/* Precalculate multiplication tables for drive gn */
int
gf_premul(uint8_t gn)
{
	int		i;

	if (gf_map[gn] != NULL)
		return (0);

	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
		return (-1);

	for (i = 0; i < 256; i++)
		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
	return (0);
}
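
/*
 * Notes on the GF(2^8) helpers above -- a sketch of the identities the
 * driver relies on, not additional functionality:
 *
 *	gf_pow[gf_log[a]] == a				(a != 0)
 *	gf_mul(a, b) == gf_pow[gf_log[a] + gf_log[b]]
 *	gf_mul(a, gf_inv(a)) == 1			(a != 0)
 *	gf_map[gn][b] == gf_mul(gn, b)			(after gf_premul(gn))
 *
 * gf_pow[] carries two copies of the 255-entry power cycle so gf_mul()
 * can add two logs (at most 510 for non-zero operands) without a modulo;
 * gf_log[0] is set to 512 so a multiplication by zero indexes the
 * zero-filled tail of gf_pow[] and yields 0, as the gf_map tables expect.
 */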