1 /* $OpenBSD: softraid_raid6.c,v 1.7 2009/11/13 23:34:24 jordan Exp $ */ 2 /* 3 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> 4 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include "bio.h" 20 21 #include <sys/param.h> 22 #include <sys/systm.h> 23 #include <sys/buf.h> 24 #include <sys/device.h> 25 #include <sys/ioctl.h> 26 #include <sys/proc.h> 27 #include <sys/malloc.h> 28 #include <sys/kernel.h> 29 #include <sys/disk.h> 30 #include <sys/rwlock.h> 31 #include <sys/queue.h> 32 #include <sys/fcntl.h> 33 #include <sys/disklabel.h> 34 #include <sys/mount.h> 35 #include <sys/sensors.h> 36 #include <sys/stat.h> 37 #include <sys/conf.h> 38 #include <sys/uio.h> 39 40 #include <scsi/scsi_all.h> 41 #include <scsi/scsiconf.h> 42 #include <scsi/scsi_disk.h> 43 44 #include <dev/softraidvar.h> 45 #include <dev/rndvar.h> 46 47 uint8_t *gf_map[256]; 48 uint8_t gf_pow[768]; 49 int gf_log[256]; 50 51 /* RAID 6 functions. */ 52 int sr_raid6_alloc_resources(struct sr_discipline *); 53 int sr_raid6_free_resources(struct sr_discipline *); 54 int sr_raid6_rw(struct sr_workunit *); 55 int sr_raid6_openings(struct sr_discipline *); 56 void sr_raid6_intr(struct buf *); 57 void sr_raid6_recreate_wu(struct sr_workunit *); 58 void sr_raid6_set_chunk_state(struct sr_discipline *, int, int); 59 void sr_raid6_set_vol_state(struct sr_discipline *); 60 61 void sr_raid6_xorp(void *, void *, int); 62 void sr_raid6_xorq(void *, void *, int, int); 63 int sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t, 64 void *, int, int, void *, void *, int); 65 void sr_dump(void *, int); 66 void sr_raid6_scrub(struct sr_discipline *); 67 int sr_failio(struct sr_workunit *); 68 69 void *sr_get_block(struct sr_discipline *, int); 70 void sr_put_block(struct sr_discipline *, void *); 71 72 void gf_init(void); 73 uint8_t gf_inv(uint8_t); 74 int gf_premul(uint8_t); 75 76 #define SR_NOFAIL 0x00 77 #define SR_FAILX (1L << 0) 78 #define SR_FAILY (1L << 1) 79 #define SR_FAILP (1L << 2) 80 #define SR_FAILQ (1L << 3) 81 82 #define M_FAIL 0x00 83 84 #define M_RX 0x01 85 #define M_RXP 0x02 86 #define M_RXQ 0x03 87 #define M_RXY 0x04 88 #define M_RFLG 0x0F 89 90 #define M_WXPQ 0x10 91 #define M_WXY 0x20 92 #define M_WPQ 0x30 93 #define M_WFLG 0xF0 94 95 /* Mapping of Failure Flags to Read/Write state */ 96 uint8_t sr_rwmode[16] = { 97 [SR_FAILX+SR_FAILY+SR_FAILP] = M_FAIL, 98 [SR_FAILX+SR_FAILY+SR_FAILQ] = M_FAIL, 99 [SR_FAILX+SR_FAILP+SR_FAILQ] = M_FAIL, 100 [SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL, 101 [SR_FAILX+SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL, 102 103 [SR_NOFAIL] = M_RX | M_WXPQ, 104 [SR_FAILY] = M_RX | M_WXPQ, 105 [SR_FAILP] = M_RX | M_WXPQ, 106 [SR_FAILQ] = M_RX | M_WXPQ, 107 [SR_FAILY+SR_FAILP] = M_RX | M_WXPQ, 108 [SR_FAILY+SR_FAILQ] = M_RX | M_WXPQ, 109 [SR_FAILP+SR_FAILQ] = M_RX | M_WXPQ, 110 111 [SR_FAILX] = M_RXQ | M_WPQ, 112 [SR_FAILX+SR_FAILQ] = M_RXQ | M_WPQ, 113 [SR_FAILX+SR_FAILP] = M_RXP | M_WPQ, 114 [SR_FAILX+SR_FAILY] = M_RXY | M_WXY, 115 }; 116 117 struct sr_raid6_opaque { 118 int gn; 119 void *pbuf; 120 void *qbuf; 121 }; 122 123 /* discipline initialisation. */ 124 void 125 sr_raid6_discipline_init(struct sr_discipline *sd) 126 { 127 /* Initialize GF256 tables */ 128 gf_init(); 129 130 /* fill out discipline members. */ 131 sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */ 132 sd->sd_max_wu = SR_RAID6_NOWU; 133 sd->sd_rebuild = 0; 134 135 /* setup discipline pointers. */ 136 sd->sd_alloc_resources = sr_raid6_alloc_resources; 137 sd->sd_free_resources = sr_raid6_free_resources; 138 sd->sd_start_discipline = NULL; 139 sd->sd_scsi_inquiry = sr_raid_inquiry; 140 sd->sd_scsi_read_cap = sr_raid_read_cap; 141 sd->sd_scsi_tur = sr_raid_tur; 142 sd->sd_scsi_req_sense = sr_raid_request_sense; 143 sd->sd_scsi_start_stop = sr_raid_start_stop; 144 sd->sd_scsi_sync = sr_raid_sync; 145 sd->sd_scsi_rw = sr_raid6_rw; 146 sd->sd_set_chunk_state = sr_raid6_set_chunk_state; 147 sd->sd_set_vol_state = sr_raid6_set_vol_state; 148 sd->sd_openings = sr_raid6_openings; 149 } 150 151 int 152 sr_raid6_openings(struct sr_discipline *sd) 153 { 154 return (sd->sd_max_wu >> 1); /* 2 wu's per IO */ 155 } 156 157 int 158 sr_raid6_alloc_resources(struct sr_discipline *sd) 159 { 160 int rv = EINVAL; 161 162 if (!sd) 163 return (rv); 164 165 DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n", 166 DEVNAME(sd->sd_sc)); 167 168 if (sr_wu_alloc(sd)) 169 goto bad; 170 if (sr_ccb_alloc(sd)) 171 goto bad; 172 173 /* setup runtime values */ 174 sd->mds.mdd_raid6.sr6_strip_bits = 175 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); 176 if (sd->mds.mdd_raid6.sr6_strip_bits == -1) 177 goto bad; 178 179 rv = 0; 180 bad: 181 return (rv); 182 } 183 184 int 185 sr_raid6_free_resources(struct sr_discipline *sd) 186 { 187 int rv = EINVAL; 188 189 if (!sd) 190 return (rv); 191 192 DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n", 193 DEVNAME(sd->sd_sc)); 194 195 sr_wu_free(sd); 196 sr_ccb_free(sd); 197 198 rv = 0; 199 return (rv); 200 } 201 202 void 203 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 204 { 205 int old_state, s; 206 207 /* XXX this is for RAID 0 */ 208 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 209 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 210 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 211 212 /* ok to go to splbio since this only happens in error path */ 213 s = splbio(); 214 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 215 216 /* multiple IOs to the same chunk that fail will come through here */ 217 if (old_state == new_state) 218 goto done; 219 220 switch (old_state) { 221 case BIOC_SDONLINE: 222 switch (new_state) { 223 case BIOC_SDOFFLINE: 224 case BIOC_SDSCRUB: 225 break; 226 default: 227 goto die; 228 } 229 break; 230 231 case BIOC_SDOFFLINE: 232 if (new_state == BIOC_SDREBUILD) { 233 ; 234 } else 235 goto die; 236 break; 237 238 case BIOC_SDSCRUB: 239 switch (new_state) { 240 case BIOC_SDONLINE: 241 case BIOC_SDOFFLINE: 242 break; 243 default: 244 goto die; 245 } 246 break; 247 248 case BIOC_SDREBUILD: 249 switch (new_state) { 250 case BIOC_SDONLINE: 251 case BIOC_SDOFFLINE: 252 break; 253 default: 254 goto die; 255 } 256 break; 257 258 default: 259 die: 260 splx(s); /* XXX */ 261 panic("%s: %s: %s: invalid chunk state transition " 262 "%d -> %d\n", DEVNAME(sd->sd_sc), 263 sd->sd_meta->ssd_devname, 264 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 265 old_state, new_state); 266 /* NOTREACHED */ 267 } 268 269 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 270 sd->sd_set_vol_state(sd); 271 272 sd->sd_must_flush = 1; 273 workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); 274 done: 275 splx(s); 276 } 277 278 void 279 sr_raid6_set_vol_state(struct sr_discipline *sd) 280 { 281 int states[SR_MAX_STATES]; 282 int new_state, i, s, nd; 283 int old_state = sd->sd_vol_status; 284 285 /* XXX this is for RAID 0 */ 286 287 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 288 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 289 290 nd = sd->sd_meta->ssdi.ssd_chunk_no; 291 292 for (i = 0; i < SR_MAX_STATES; i++) 293 states[i] = 0; 294 295 for (i = 0; i < nd; i++) { 296 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 297 if (s >= SR_MAX_STATES) 298 panic("%s: %s: %s: invalid chunk state", 299 DEVNAME(sd->sd_sc), 300 sd->sd_meta->ssd_devname, 301 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 302 states[s]++; 303 } 304 305 if (states[BIOC_SDONLINE] == nd) 306 new_state = BIOC_SVONLINE; 307 else if (states[BIOC_SDONLINE] < nd - 2) 308 new_state = BIOC_SVOFFLINE; 309 else if (states[BIOC_SDSCRUB] != 0) 310 new_state = BIOC_SVSCRUB; 311 else if (states[BIOC_SDREBUILD] != 0) 312 new_state = BIOC_SVREBUILD; 313 else if (states[BIOC_SDONLINE] < nd) 314 new_state = BIOC_SVDEGRADED; 315 else { 316 printf("old_state = %d, ", old_state); 317 for (i = 0; i < nd; i++) 318 printf("%d = %d, ", i, 319 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 320 panic("invalid new_state"); 321 } 322 323 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n", 324 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 325 old_state, new_state); 326 327 switch (old_state) { 328 case BIOC_SVONLINE: 329 switch (new_state) { 330 case BIOC_SVONLINE: /* can go to same state */ 331 case BIOC_SVOFFLINE: 332 case BIOC_SVDEGRADED: 333 case BIOC_SVREBUILD: /* happens on boot */ 334 break; 335 default: 336 goto die; 337 } 338 break; 339 340 case BIOC_SVOFFLINE: 341 /* XXX this might be a little too much */ 342 goto die; 343 344 case BIOC_SVSCRUB: 345 switch (new_state) { 346 case BIOC_SVONLINE: 347 case BIOC_SVOFFLINE: 348 case BIOC_SVDEGRADED: 349 case BIOC_SVSCRUB: /* can go to same state */ 350 break; 351 default: 352 goto die; 353 } 354 break; 355 356 case BIOC_SVBUILDING: 357 switch (new_state) { 358 case BIOC_SVONLINE: 359 case BIOC_SVOFFLINE: 360 case BIOC_SVBUILDING: /* can go to the same state */ 361 break; 362 default: 363 goto die; 364 } 365 break; 366 367 case BIOC_SVREBUILD: 368 switch (new_state) { 369 case BIOC_SVONLINE: 370 case BIOC_SVOFFLINE: 371 case BIOC_SVDEGRADED: 372 case BIOC_SVREBUILD: /* can go to the same state */ 373 break; 374 default: 375 goto die; 376 } 377 break; 378 379 case BIOC_SVDEGRADED: 380 switch (new_state) { 381 case BIOC_SVOFFLINE: 382 case BIOC_SVREBUILD: 383 case BIOC_SVDEGRADED: /* can go to the same state */ 384 break; 385 default: 386 goto die; 387 } 388 break; 389 390 default: 391 die: 392 panic("%s: %s: invalid volume state transition %d -> %d\n", 393 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 394 old_state, new_state); 395 /* NOTREACHED */ 396 } 397 398 sd->sd_vol_status = new_state; 399 } 400 401 /* modes: 402 * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 403 * SR_CCBF_FREEBUF, qbuf, NULL, 0); 404 * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 405 * SR_CCBF_FREEBUF, pbuf, NULL, 0); 406 * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 407 * SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]); 408 */ 409 410 int 411 sr_raid6_rw(struct sr_workunit *wu) 412 { 413 struct sr_workunit *wu_w = NULL; 414 struct sr_discipline *sd = wu->swu_dis; 415 struct scsi_xfer *xs = wu->swu_xs; 416 struct sr_chunk *scp; 417 int s, fail, i, rwmode; 418 daddr64_t blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk; 419 daddr64_t strip_size, no_chunk, lba, chunk_offs, phys_offs; 420 daddr64_t strip_bits, length, strip_offs, datalen; 421 void *pbuf, *data, *qbuf; 422 423 /* blk and scsi error will be handled by sr_validate_io */ 424 if (sr_validate_io(wu, &blk, "sr_raid6_rw")) 425 goto bad; 426 427 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 428 strip_bits = sd->mds.mdd_raid6.sr6_strip_bits; 429 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2; 430 431 data = xs->data; 432 datalen = xs->datalen; 433 lbaoffs = blk << DEV_BSHIFT; 434 435 rwmode = (xs->flags & SCSI_DATA_IN) ? M_RFLG : M_WFLG; 436 if (xs->flags & SCSI_DATA_OUT) 437 /* create write workunit */ 438 if ((wu_w = sr_wu_get(sd, 0)) == NULL) { 439 printf("%s: can't get wu_w", DEVNAME(sd->sd_sc)); 440 goto bad; 441 } 442 443 wu->swu_blk_start = 0; 444 while (datalen != 0) { 445 strip_no = lbaoffs >> strip_bits; 446 strip_offs = lbaoffs & (strip_size - 1); 447 chunk_offs = (strip_no / no_chunk) << strip_bits; 448 phys_offs = chunk_offs + strip_offs + 449 ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT); 450 451 /* get size remaining in this stripe */ 452 length = MIN(strip_size - strip_offs, datalen); 453 454 /* map disk offset to parity/data drive */ 455 chunk = strip_no % no_chunk; 456 457 qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2)); 458 if (qchunk == 0) 459 pchunk = no_chunk + 1; 460 else 461 pchunk = qchunk - 1; 462 if (chunk >= pchunk) 463 chunk++; 464 if (chunk >= qchunk) 465 chunk++; 466 467 lba = phys_offs >> DEV_BSHIFT; 468 469 /* XXX big hammer.. exclude I/O from entire stripe */ 470 if (wu->swu_blk_start == 0) 471 wu->swu_blk_start = chunk_offs >> DEV_BSHIFT; 472 wu->swu_blk_end = ((chunk_offs + (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1; 473 474 fail = 0; 475 fchunk = -1; 476 477 /* Get disk-fail flags */ 478 for (i=0; i< no_chunk+2; i++) { 479 scp = sd->sd_vol.sv_chunks[i]; 480 switch (scp->src_meta.scm_status) { 481 case BIOC_SDOFFLINE: 482 case BIOC_SDREBUILD: 483 case BIOC_SDHOTSPARE: 484 if (i == qchunk) 485 fail |= SR_FAILQ; 486 else if (i == pchunk) 487 fail |= SR_FAILP; 488 else if (i == chunk) 489 fail |= SR_FAILX; 490 else { 491 /* dual data-disk failure */ 492 fail |= SR_FAILY; 493 fchunk = i; 494 } 495 break; 496 } 497 } 498 if (xs->flags & SCSI_DATA_IN) { 499 if (!(fail & SR_FAILX)) { 500 /* drive is good. issue single read request */ 501 if (sr_raid6_addio(wu, chunk, lba, length, 502 data, xs->flags, 0, NULL, NULL, 0)) 503 goto bad; 504 } else if (fail & SR_FAILP) { 505 /* Dx, P failed */ 506 printf("Disk %llx offline, " 507 "regenerating Dx+P\n", chunk); 508 509 qbuf = sr_get_block(sd, length); 510 if (qbuf == NULL) 511 goto bad; 512 513 /* Calculate: Dx*gx = Q^(Dz*gz) 514 * Q: sr_raid6_xorp(data, --, length); 515 * Dz: sr_raid6_xorq(data, --, length, gf_pow[i]); 516 */ 517 memset(data, 0, length); 518 for (i = 0; i < no_chunk+2; i++) { 519 if (i == qchunk) { 520 /* Read Q */ 521 if (sr_raid6_addio(wu, i, lba, 522 length, NULL, SCSI_DATA_IN, 523 SR_CCBF_FREEBUF, qbuf, 524 NULL, 0)) 525 goto bad; 526 } else if (i != chunk && i != pchunk) { 527 /* Read Dz * gz */ 528 if (sr_raid6_addio(wu, i, lba, 529 length, NULL, SCSI_DATA_IN, 530 SR_CCBF_FREEBUF, NULL, 531 qbuf, gf_pow[i])) 532 goto bad; 533 } 534 } 535 536 /* run fake wu when read i/o is complete */ 537 if (wu_w == NULL && 538 (wu_w = sr_wu_get(sd, 0)) == NULL) 539 goto bad; 540 541 wu_w->swu_flags |= SR_WUF_FAIL; 542 if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0, 543 SR_CCBF_FREEBUF, NULL, data, 544 gf_inv(gf_pow[chunk]))) 545 goto bad; 546 } else if (fail & SR_FAILY) { 547 /* Dx, Dy failed */ 548 printf("Disk %llx & %llx offline, " 549 "regenerating Dx+Dy\n", chunk, fchunk); 550 qbuf = sr_get_block(sd, length); 551 if (qbuf == NULL) 552 goto bad; 553 pbuf = sr_get_block(sd, length); 554 if (pbuf == NULL) 555 goto bad; 556 557 /* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz 558 * Q: sr_raid6_xorp(qbuf, --, length); 559 * P: sr_raid6_xorp(pbuf, --, length); 560 * Dz: sr_raid6_xorp(pbuf, --, length); 561 * sr_raid6_xorq(qbuf, --, length, gf_pow[i]); 562 */ 563 memset(data, 0, length); 564 for (i = 0; i < no_chunk+2; i++) { 565 if (i == qchunk) { 566 /* read Q */ 567 if (sr_raid6_addio(wu, i, lba, 568 length, NULL, SCSI_DATA_IN, 569 SR_CCBF_FREEBUF, qbuf, 570 NULL, 0)) 571 goto bad; 572 } else if (i == pchunk) { 573 /* read P */ 574 if (sr_raid6_addio(wu, i, lba, 575 length, NULL, SCSI_DATA_IN, 576 SR_CCBF_FREEBUF, pbuf, 577 NULL, 0)) 578 goto bad; 579 } else if (i != chunk) { 580 /* read Dz * gz */ 581 if (sr_raid6_addio(wu, i, lba, 582 length, NULL, SCSI_DATA_IN, 583 SR_CCBF_FREEBUF, pbuf, 584 qbuf, gf_pow[i])) 585 goto bad; 586 } 587 } 588 589 /* run fake wu when read i/o is complete */ 590 if (wu_w == NULL && 591 (wu_w = sr_wu_get(sd, 0)) == NULL) 592 goto bad; 593 594 wu_w->swu_flags |= SR_WUF_FAIL; 595 if (sr_raid6_addio(wu_w, 0, 0, length, pbuf, 0, 596 SR_CCBF_FREEBUF, NULL, data, 597 gf_inv(gf_pow[255+chunk-fchunk] ^ 1))) 598 goto bad; 599 if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0, 600 SR_CCBF_FREEBUF, NULL, data, 601 gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]))) 602 goto bad; 603 } else { 604 /* Two cases: single disk (Dx) or (Dx+Q) 605 * Dx = Dz ^ P (same as RAID5) 606 */ 607 printf("Disk %llx offline, " 608 "regenerating Dx%s\n", chunk, 609 fail & SR_FAILQ ? "+Q" : " single"); 610 611 /* Calculate: Dx = P^Dz 612 * P: sr_raid6_xorp(data, ---, length); 613 * Dz: sr_raid6_xorp(data, ---, length); 614 */ 615 memset(data, 0, length); 616 for (i = 0; i < no_chunk+2; i++) { 617 if (i != chunk && i != qchunk) { 618 /* Read Dz */ 619 if (sr_raid6_addio(wu, i, lba, 620 length, NULL, SCSI_DATA_IN, 621 SR_CCBF_FREEBUF, data, 622 NULL, 0)) 623 goto bad; 624 } 625 } 626 627 /* data will contain correct value on completion */ 628 } 629 } else { 630 /* XXX handle writes to failed/offline disk? */ 631 if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP)) 632 goto bad; 633 634 /* 635 * initialize pbuf with contents of new data to be 636 * written. This will be XORed with old data and old 637 * parity in the intr routine. The result in pbuf 638 * is the new parity data. 639 */ 640 qbuf = sr_get_block(sd, length); 641 if (qbuf == NULL) 642 goto bad; 643 644 pbuf = sr_get_block(sd, length); 645 if (pbuf == NULL) 646 goto bad; 647 648 /* Calulate P = Dn; Q = gn * Dn */ 649 if (gf_premul(gf_pow[chunk])) 650 goto bad; 651 sr_raid6_xorp(pbuf, data, length); 652 sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]); 653 654 /* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */ 655 if (sr_raid6_addio(wu, chunk, lba, length, NULL, 656 SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf, 657 gf_pow[chunk])) 658 goto bad; 659 660 /* Read old xor-parity: P ^= P' */ 661 if (sr_raid6_addio(wu, pchunk, lba, length, NULL, 662 SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0)) 663 goto bad; 664 665 /* Read old q-parity: Q ^= Q' */ 666 if (sr_raid6_addio(wu, qchunk, lba, length, NULL, 667 SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0)) 668 goto bad; 669 670 /* write new data */ 671 if (sr_raid6_addio(wu_w, chunk, lba, length, data, 672 xs->flags, 0, NULL, NULL, 0)) 673 goto bad; 674 675 /* write new xor-parity */ 676 if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf, 677 xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) 678 goto bad; 679 680 /* write new q-parity */ 681 if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf, 682 xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) 683 goto bad; 684 } 685 686 /* advance to next block */ 687 lbaoffs += length; 688 datalen -= length; 689 data += length; 690 } 691 692 s = splbio(); 693 if (wu_w) { 694 /* collide write request with reads */ 695 wu_w->swu_blk_start = wu->swu_blk_start; 696 wu_w->swu_blk_end = wu->swu_blk_end; 697 698 /* 699 * put xs block in write request (scsi_done not called till 700 * write completes) 701 */ 702 wu_w->swu_xs = wu->swu_xs; 703 wu->swu_xs = NULL; 704 705 wu_w->swu_state = SR_WU_DEFERRED; 706 wu->swu_collider = wu_w; 707 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 708 } 709 710 /* rebuild io, let rebuild routine deal with it */ 711 if (wu->swu_flags & SR_WUF_REBUILD) 712 goto queued; 713 714 /* current io failed, restart */ 715 if (wu->swu_state == SR_WU_RESTART) 716 goto start; 717 718 /* deferred io failed, don't restart */ 719 if (wu->swu_state == SR_WU_REQUEUE) 720 goto queued; 721 722 if (sr_check_io_collision(wu)) 723 goto queued; 724 725 start: 726 sr_raid_startwu(wu); 727 queued: 728 splx(s); 729 return (0); 730 bad: 731 /* wu is unwound by sr_wu_put */ 732 if (wu_w) 733 sr_wu_put(wu_w); 734 return (1); 735 } 736 737 /* Handle failure I/O completion */ 738 int 739 sr_failio(struct sr_workunit *wu) 740 { 741 struct sr_discipline *sd = wu->swu_dis; 742 struct sr_ccb *ccb; 743 744 if (!(wu->swu_flags & SR_WUF_FAIL)) 745 return (0); 746 747 /* Wu is a 'fake'.. don't do real I/O just intr */ 748 TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link); 749 TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) 750 sr_raid6_intr(&ccb->ccb_buf); 751 return (1); 752 } 753 754 void 755 sr_raid6_intr(struct buf *bp) 756 { 757 struct sr_ccb *ccb = (struct sr_ccb *)bp; 758 struct sr_workunit *wu = ccb->ccb_wu, *wup; 759 struct sr_discipline *sd = wu->swu_dis; 760 struct scsi_xfer *xs = wu->swu_xs; 761 struct sr_softc *sc = sd->sd_sc; 762 struct sr_raid6_opaque *pq = ccb->ccb_opaque; 763 int s, pend; 764 765 DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n", 766 DEVNAME(sc), bp, xs); 767 768 DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" 769 " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), 770 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags, 771 ccb->ccb_buf.b_blkno, ccb->ccb_target); 772 773 s = splbio(); 774 775 if (ccb->ccb_buf.b_flags & B_ERROR) { 776 DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", 777 DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target); 778 printf("io error: disk %x\n", ccb->ccb_target); 779 wu->swu_ios_failed++; 780 ccb->ccb_state = SR_CCB_FAILED; 781 if (ccb->ccb_target != -1) 782 sd->sd_set_chunk_state(sd, ccb->ccb_target, 783 BIOC_SDOFFLINE); 784 else 785 panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); 786 } else { 787 ccb->ccb_state = SR_CCB_OK; 788 wu->swu_ios_succeeded++; 789 790 /* XOR data to result */ 791 if (pq) { 792 if (pq->pbuf) 793 /* Calculate xor-parity */ 794 sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data, 795 ccb->ccb_buf.b_bcount); 796 if (pq->qbuf) 797 /* Calculate q-parity */ 798 sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data, 799 ccb->ccb_buf.b_bcount, pq->gn); 800 free(pq, M_DEVBUF); 801 ccb->ccb_opaque = NULL; 802 } 803 } 804 805 /* free allocated data buffer */ 806 if (ccb->ccb_flag & SR_CCBF_FREEBUF) { 807 sr_put_block(sd, ccb->ccb_buf.b_data); 808 ccb->ccb_buf.b_data = NULL; 809 } 810 wu->swu_ios_complete++; 811 812 DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", 813 DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, 814 wu->swu_ios_failed); 815 816 if (wu->swu_ios_complete >= wu->swu_io_count) { 817 818 /* if all ios failed, retry reads and give up on writes */ 819 if (wu->swu_ios_failed == wu->swu_ios_complete) { 820 if (xs->flags & SCSI_DATA_IN) { 821 printf("%s: retrying read on block %lld\n", 822 DEVNAME(sc), ccb->ccb_buf.b_blkno); 823 sr_ccb_put(ccb); 824 TAILQ_INIT(&wu->swu_ccb); 825 wu->swu_state = SR_WU_RESTART; 826 if (sd->sd_scsi_rw(wu)) 827 goto bad; 828 else 829 goto retry; 830 } else { 831 printf("%s: permanently fail write on block " 832 "%lld\n", DEVNAME(sc), 833 ccb->ccb_buf.b_blkno); 834 xs->error = XS_DRIVER_STUFFUP; 835 goto bad; 836 } 837 } 838 839 if (xs != NULL) { 840 xs->error = XS_NOERROR; 841 xs->resid = 0; 842 xs->flags |= ITSDONE; 843 } 844 845 pend = 0; 846 TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { 847 if (wu == wup) { 848 /* wu on pendq, remove */ 849 TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); 850 pend = 1; 851 852 if (wu->swu_collider) { 853 if (wu->swu_ios_failed) 854 /* toss all ccbs and recreate */ 855 sr_raid6_recreate_wu(wu->swu_collider); 856 857 /* restart deferred wu */ 858 wu->swu_collider->swu_state = 859 SR_WU_INPROGRESS; 860 TAILQ_REMOVE(&sd->sd_wu_defq, 861 wu->swu_collider, swu_link); 862 if (sr_failio(wu->swu_collider) == 0) 863 sr_raid_startwu(wu->swu_collider); 864 } 865 break; 866 } 867 } 868 869 if (!pend) 870 printf("%s: wu: %p not on pending queue\n", 871 DEVNAME(sc), wu); 872 873 if (wu->swu_flags & SR_WUF_REBUILD) { 874 if (wu->swu_xs->flags & SCSI_DATA_OUT) { 875 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 876 wakeup(wu); 877 } 878 } else { 879 /* do not change the order of these 2 functions */ 880 sr_wu_put(wu); 881 if (xs != NULL) 882 scsi_done(xs); 883 } 884 885 if (sd->sd_sync && sd->sd_wu_pending == 0) 886 wakeup(sd); 887 } 888 889 retry: 890 splx(s); 891 return; 892 bad: 893 xs->error = XS_DRIVER_STUFFUP; 894 xs->flags |= ITSDONE; 895 if (wu->swu_flags & SR_WUF_REBUILD) { 896 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 897 wakeup(wu); 898 } else { 899 /* do not change the order of these 2 functions */ 900 sr_wu_put(wu); 901 scsi_done(xs); 902 } 903 904 splx(s); 905 } 906 907 void 908 sr_raid6_recreate_wu(struct sr_workunit *wu) 909 { 910 struct sr_discipline *sd = wu->swu_dis; 911 struct sr_workunit *wup = wu; 912 struct sr_ccb *ccb; 913 914 do { 915 DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", wup); 916 917 /* toss all ccbs */ 918 while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { 919 TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); 920 sr_ccb_put(ccb); 921 } 922 TAILQ_INIT(&wup->swu_ccb); 923 924 /* recreate ccbs */ 925 wup->swu_state = SR_WU_REQUEUE; 926 if (sd->sd_scsi_rw(wup)) 927 panic("could not requeue io"); 928 929 wup = wup->swu_collider; 930 } while (wup); 931 } 932 933 int 934 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len, 935 void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn) 936 { 937 struct sr_discipline *sd = wu->swu_dis; 938 struct sr_ccb *ccb; 939 struct sr_raid6_opaque *pqbuf; 940 941 ccb = sr_ccb_get(sd); 942 if (!ccb) 943 return (-1); 944 945 /* allocate temporary buffer */ 946 if (data == NULL) { 947 data = sr_get_block(sd, len); 948 if (data == NULL) 949 return (-1); 950 } 951 952 DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n", 953 flag & SCSI_DATA_IN ? "read" : "write", 954 dsk, blk, len, pbuf, qbuf); 955 956 ccb->ccb_flag = ccbflag; 957 if (flag & SCSI_POLL) { 958 ccb->ccb_buf.b_flags = 0; 959 ccb->ccb_buf.b_iodone = NULL; 960 } else { 961 ccb->ccb_buf.b_flags = B_CALL; 962 ccb->ccb_buf.b_iodone = sr_raid6_intr; 963 } 964 if (flag & SCSI_DATA_IN) 965 ccb->ccb_buf.b_flags |= B_READ; 966 else 967 ccb->ccb_buf.b_flags |= B_WRITE; 968 969 /* add offset for metadata */ 970 ccb->ccb_buf.b_flags |= B_PHYS; 971 ccb->ccb_buf.b_blkno = blk; 972 ccb->ccb_buf.b_bcount = len; 973 ccb->ccb_buf.b_bufsize = len; 974 ccb->ccb_buf.b_resid = len; 975 ccb->ccb_buf.b_data = data; 976 ccb->ccb_buf.b_error = 0; 977 ccb->ccb_buf.b_proc = curproc; 978 ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm; 979 ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn; 980 if ((ccb->ccb_buf.b_flags & B_READ) == 0) 981 ccb->ccb_buf.b_vp->v_numoutput++; 982 983 ccb->ccb_wu = wu; 984 ccb->ccb_target = dsk; 985 if (pbuf || qbuf) { 986 if (qbuf && gf_premul(gn)) 987 return (-1); 988 989 pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL); 990 if (pqbuf == NULL) { 991 sr_ccb_put(ccb); 992 return (-1); 993 } 994 pqbuf->pbuf = pbuf; 995 pqbuf->qbuf = qbuf; 996 pqbuf->gn = gn; 997 ccb->ccb_opaque = pqbuf; 998 } 999 1000 LIST_INIT(&ccb->ccb_buf.b_dep); 1001 TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); 1002 1003 DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d " 1004 "b_blkno: %x b_flags 0x%0x b_data %p\n", 1005 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 1006 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, 1007 ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); 1008 1009 wu->swu_io_count++; 1010 1011 return (0); 1012 } 1013 1014 /* Perform RAID6 parity calculation. 1015 * P=xor parity, Q=GF256 parity, D=data, gn=disk# */ 1016 void 1017 sr_raid6_xorp(void *p, void *d, int len) 1018 { 1019 uint8_t *pbuf = p, *data = d; 1020 1021 while (len--) 1022 pbuf[len] ^= data[len]; 1023 } 1024 1025 void 1026 sr_raid6_xorq(void *q, void *d, int len, int gn) 1027 { 1028 uint8_t *qbuf = q, *data = d; 1029 uint8_t *gn_map = gf_map[gn]; 1030 1031 /* Have to do this a byte at a time */ 1032 /* Faster multiply.. gn is always constant */ 1033 while (len--) 1034 qbuf[len] ^= gn_map[data[len]]; 1035 } 1036 1037 /* Create GF256 log/pow tables: polynomial = 0x11D */ 1038 void 1039 gf_init(void) 1040 { 1041 int i; 1042 uint8_t p = 1; 1043 1044 /* use 2N pow table to avoid using % in multiply */ 1045 for (i=0; i<256; i++) { 1046 gf_log[p] = i; 1047 gf_pow[i] = gf_pow[i+255] = p; 1048 p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00)); 1049 } 1050 gf_log[0] = 512; 1051 } 1052 1053 uint8_t 1054 gf_inv(uint8_t a) 1055 { 1056 return gf_pow[255 - gf_log[a]]; 1057 } 1058 1059 /* Precalculate multiplication tables for drive gn */ 1060 int 1061 gf_premul(uint8_t gn) 1062 { 1063 int i; 1064 1065 if (gf_map[gn] != NULL) 1066 return (0); 1067 1068 if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL) 1069 return (-1); 1070 1071 for (i=0; i<256; i++) 1072 gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]]; 1073 return (0); 1074 } 1075