1 /* $OpenBSD: softraid_raid6.c,v 1.9 2009/12/07 14:33:38 jsing Exp $ */ 2 /* 3 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> 4 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include "bio.h" 20 21 #include <sys/param.h> 22 #include <sys/systm.h> 23 #include <sys/buf.h> 24 #include <sys/device.h> 25 #include <sys/ioctl.h> 26 #include <sys/proc.h> 27 #include <sys/malloc.h> 28 #include <sys/kernel.h> 29 #include <sys/disk.h> 30 #include <sys/rwlock.h> 31 #include <sys/queue.h> 32 #include <sys/fcntl.h> 33 #include <sys/disklabel.h> 34 #include <sys/mount.h> 35 #include <sys/sensors.h> 36 #include <sys/stat.h> 37 #include <sys/conf.h> 38 #include <sys/uio.h> 39 40 #include <scsi/scsi_all.h> 41 #include <scsi/scsiconf.h> 42 #include <scsi/scsi_disk.h> 43 44 #include <dev/softraidvar.h> 45 #include <dev/rndvar.h> 46 47 uint8_t *gf_map[256]; 48 uint8_t gf_pow[768]; 49 int gf_log[256]; 50 51 /* RAID 6 functions. */ 52 int sr_raid6_alloc_resources(struct sr_discipline *); 53 int sr_raid6_free_resources(struct sr_discipline *); 54 int sr_raid6_rw(struct sr_workunit *); 55 int sr_raid6_openings(struct sr_discipline *); 56 void sr_raid6_intr(struct buf *); 57 void sr_raid6_recreate_wu(struct sr_workunit *); 58 void sr_raid6_set_chunk_state(struct sr_discipline *, int, int); 59 void sr_raid6_set_vol_state(struct sr_discipline *); 60 61 void sr_raid6_xorp(void *, void *, int); 62 void sr_raid6_xorq(void *, void *, int, int); 63 int sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t, 64 void *, int, int, void *, void *, int); 65 void sr_dump(void *, int); 66 void sr_raid6_scrub(struct sr_discipline *); 67 int sr_failio(struct sr_workunit *); 68 69 void *sr_get_block(struct sr_discipline *, int); 70 void sr_put_block(struct sr_discipline *, void *); 71 72 void gf_init(void); 73 uint8_t gf_inv(uint8_t); 74 int gf_premul(uint8_t); 75 76 #define SR_NOFAIL 0x00 77 #define SR_FAILX (1L << 0) 78 #define SR_FAILY (1L << 1) 79 #define SR_FAILP (1L << 2) 80 #define SR_FAILQ (1L << 3) 81 82 #define M_FAIL 0x00 83 84 #define M_RX 0x01 85 #define M_RXP 0x02 86 #define M_RXQ 0x03 87 #define M_RXY 0x04 88 #define M_RFLG 0x0F 89 90 #define M_WXPQ 0x10 91 #define M_WXY 0x20 92 #define M_WPQ 0x30 93 #define M_WFLG 0xF0 94 95 /* Mapping of Failure Flags to Read/Write state */ 96 uint8_t sr_rwmode[16] = { 97 [SR_FAILX+SR_FAILY+SR_FAILP] = M_FAIL, 98 [SR_FAILX+SR_FAILY+SR_FAILQ] = M_FAIL, 99 [SR_FAILX+SR_FAILP+SR_FAILQ] = M_FAIL, 100 [SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL, 101 [SR_FAILX+SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL, 102 103 [SR_NOFAIL] = M_RX | M_WXPQ, 104 [SR_FAILY] = M_RX | M_WXPQ, 105 [SR_FAILP] = M_RX | M_WXPQ, 106 [SR_FAILQ] = M_RX | M_WXPQ, 107 [SR_FAILY+SR_FAILP] = M_RX | M_WXPQ, 108 [SR_FAILY+SR_FAILQ] = M_RX | M_WXPQ, 109 [SR_FAILP+SR_FAILQ] = M_RX | M_WXPQ, 110 111 [SR_FAILX] = M_RXQ | M_WPQ, 112 [SR_FAILX+SR_FAILQ] = M_RXQ | M_WPQ, 113 [SR_FAILX+SR_FAILP] = M_RXP | M_WPQ, 114 [SR_FAILX+SR_FAILY] = M_RXY | M_WXY, 115 }; 116 117 struct sr_raid6_opaque { 118 int gn; 119 void *pbuf; 120 void *qbuf; 121 }; 122 123 /* discipline initialisation. */ 124 void 125 sr_raid6_discipline_init(struct sr_discipline *sd) 126 { 127 128 /* Initialize GF256 tables */ 129 gf_init(); 130 131 /* fill out discipline members. */ 132 sd->sd_type = SR_MD_RAID6; 133 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE; 134 sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */ 135 sd->sd_max_wu = SR_RAID6_NOWU; 136 137 /* setup discipline pointers. */ 138 sd->sd_alloc_resources = sr_raid6_alloc_resources; 139 sd->sd_free_resources = sr_raid6_free_resources; 140 sd->sd_start_discipline = NULL; 141 sd->sd_scsi_inquiry = sr_raid_inquiry; 142 sd->sd_scsi_read_cap = sr_raid_read_cap; 143 sd->sd_scsi_tur = sr_raid_tur; 144 sd->sd_scsi_req_sense = sr_raid_request_sense; 145 sd->sd_scsi_start_stop = sr_raid_start_stop; 146 sd->sd_scsi_sync = sr_raid_sync; 147 sd->sd_scsi_rw = sr_raid6_rw; 148 sd->sd_set_chunk_state = sr_raid6_set_chunk_state; 149 sd->sd_set_vol_state = sr_raid6_set_vol_state; 150 sd->sd_openings = sr_raid6_openings; 151 } 152 153 int 154 sr_raid6_openings(struct sr_discipline *sd) 155 { 156 return (sd->sd_max_wu >> 1); /* 2 wu's per IO */ 157 } 158 159 int 160 sr_raid6_alloc_resources(struct sr_discipline *sd) 161 { 162 int rv = EINVAL; 163 164 if (!sd) 165 return (rv); 166 167 DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n", 168 DEVNAME(sd->sd_sc)); 169 170 if (sr_wu_alloc(sd)) 171 goto bad; 172 if (sr_ccb_alloc(sd)) 173 goto bad; 174 175 /* setup runtime values */ 176 sd->mds.mdd_raid6.sr6_strip_bits = 177 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); 178 if (sd->mds.mdd_raid6.sr6_strip_bits == -1) 179 goto bad; 180 181 rv = 0; 182 bad: 183 return (rv); 184 } 185 186 int 187 sr_raid6_free_resources(struct sr_discipline *sd) 188 { 189 int rv = EINVAL; 190 191 if (!sd) 192 return (rv); 193 194 DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n", 195 DEVNAME(sd->sd_sc)); 196 197 sr_wu_free(sd); 198 sr_ccb_free(sd); 199 200 rv = 0; 201 return (rv); 202 } 203 204 void 205 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 206 { 207 int old_state, s; 208 209 /* XXX this is for RAID 0 */ 210 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 211 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 212 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 213 214 /* ok to go to splbio since this only happens in error path */ 215 s = splbio(); 216 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 217 218 /* multiple IOs to the same chunk that fail will come through here */ 219 if (old_state == new_state) 220 goto done; 221 222 switch (old_state) { 223 case BIOC_SDONLINE: 224 switch (new_state) { 225 case BIOC_SDOFFLINE: 226 case BIOC_SDSCRUB: 227 break; 228 default: 229 goto die; 230 } 231 break; 232 233 case BIOC_SDOFFLINE: 234 if (new_state == BIOC_SDREBUILD) { 235 ; 236 } else 237 goto die; 238 break; 239 240 case BIOC_SDSCRUB: 241 switch (new_state) { 242 case BIOC_SDONLINE: 243 case BIOC_SDOFFLINE: 244 break; 245 default: 246 goto die; 247 } 248 break; 249 250 case BIOC_SDREBUILD: 251 switch (new_state) { 252 case BIOC_SDONLINE: 253 case BIOC_SDOFFLINE: 254 break; 255 default: 256 goto die; 257 } 258 break; 259 260 default: 261 die: 262 splx(s); /* XXX */ 263 panic("%s: %s: %s: invalid chunk state transition " 264 "%d -> %d\n", DEVNAME(sd->sd_sc), 265 sd->sd_meta->ssd_devname, 266 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 267 old_state, new_state); 268 /* NOTREACHED */ 269 } 270 271 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 272 sd->sd_set_vol_state(sd); 273 274 sd->sd_must_flush = 1; 275 workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); 276 done: 277 splx(s); 278 } 279 280 void 281 sr_raid6_set_vol_state(struct sr_discipline *sd) 282 { 283 int states[SR_MAX_STATES]; 284 int new_state, i, s, nd; 285 int old_state = sd->sd_vol_status; 286 287 /* XXX this is for RAID 0 */ 288 289 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 290 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 291 292 nd = sd->sd_meta->ssdi.ssd_chunk_no; 293 294 for (i = 0; i < SR_MAX_STATES; i++) 295 states[i] = 0; 296 297 for (i = 0; i < nd; i++) { 298 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 299 if (s >= SR_MAX_STATES) 300 panic("%s: %s: %s: invalid chunk state", 301 DEVNAME(sd->sd_sc), 302 sd->sd_meta->ssd_devname, 303 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 304 states[s]++; 305 } 306 307 if (states[BIOC_SDONLINE] == nd) 308 new_state = BIOC_SVONLINE; 309 else if (states[BIOC_SDONLINE] < nd - 2) 310 new_state = BIOC_SVOFFLINE; 311 else if (states[BIOC_SDSCRUB] != 0) 312 new_state = BIOC_SVSCRUB; 313 else if (states[BIOC_SDREBUILD] != 0) 314 new_state = BIOC_SVREBUILD; 315 else if (states[BIOC_SDONLINE] < nd) 316 new_state = BIOC_SVDEGRADED; 317 else { 318 printf("old_state = %d, ", old_state); 319 for (i = 0; i < nd; i++) 320 printf("%d = %d, ", i, 321 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 322 panic("invalid new_state"); 323 } 324 325 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n", 326 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 327 old_state, new_state); 328 329 switch (old_state) { 330 case BIOC_SVONLINE: 331 switch (new_state) { 332 case BIOC_SVONLINE: /* can go to same state */ 333 case BIOC_SVOFFLINE: 334 case BIOC_SVDEGRADED: 335 case BIOC_SVREBUILD: /* happens on boot */ 336 break; 337 default: 338 goto die; 339 } 340 break; 341 342 case BIOC_SVOFFLINE: 343 /* XXX this might be a little too much */ 344 goto die; 345 346 case BIOC_SVSCRUB: 347 switch (new_state) { 348 case BIOC_SVONLINE: 349 case BIOC_SVOFFLINE: 350 case BIOC_SVDEGRADED: 351 case BIOC_SVSCRUB: /* can go to same state */ 352 break; 353 default: 354 goto die; 355 } 356 break; 357 358 case BIOC_SVBUILDING: 359 switch (new_state) { 360 case BIOC_SVONLINE: 361 case BIOC_SVOFFLINE: 362 case BIOC_SVBUILDING: /* can go to the same state */ 363 break; 364 default: 365 goto die; 366 } 367 break; 368 369 case BIOC_SVREBUILD: 370 switch (new_state) { 371 case BIOC_SVONLINE: 372 case BIOC_SVOFFLINE: 373 case BIOC_SVDEGRADED: 374 case BIOC_SVREBUILD: /* can go to the same state */ 375 break; 376 default: 377 goto die; 378 } 379 break; 380 381 case BIOC_SVDEGRADED: 382 switch (new_state) { 383 case BIOC_SVOFFLINE: 384 case BIOC_SVREBUILD: 385 case BIOC_SVDEGRADED: /* can go to the same state */ 386 break; 387 default: 388 goto die; 389 } 390 break; 391 392 default: 393 die: 394 panic("%s: %s: invalid volume state transition %d -> %d\n", 395 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 396 old_state, new_state); 397 /* NOTREACHED */ 398 } 399 400 sd->sd_vol_status = new_state; 401 } 402 403 /* modes: 404 * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 405 * SR_CCBF_FREEBUF, qbuf, NULL, 0); 406 * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 407 * SR_CCBF_FREEBUF, pbuf, NULL, 0); 408 * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 409 * SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]); 410 */ 411 412 int 413 sr_raid6_rw(struct sr_workunit *wu) 414 { 415 struct sr_workunit *wu_w = NULL; 416 struct sr_discipline *sd = wu->swu_dis; 417 struct scsi_xfer *xs = wu->swu_xs; 418 struct sr_chunk *scp; 419 int s, fail, i, rwmode; 420 daddr64_t blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk; 421 daddr64_t strip_size, no_chunk, lba, chunk_offs, phys_offs; 422 daddr64_t strip_bits, length, strip_offs, datalen; 423 void *pbuf, *data, *qbuf; 424 425 /* blk and scsi error will be handled by sr_validate_io */ 426 if (sr_validate_io(wu, &blk, "sr_raid6_rw")) 427 goto bad; 428 429 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 430 strip_bits = sd->mds.mdd_raid6.sr6_strip_bits; 431 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2; 432 433 data = xs->data; 434 datalen = xs->datalen; 435 lbaoffs = blk << DEV_BSHIFT; 436 437 rwmode = (xs->flags & SCSI_DATA_IN) ? M_RFLG : M_WFLG; 438 if (xs->flags & SCSI_DATA_OUT) 439 /* create write workunit */ 440 if ((wu_w = sr_wu_get(sd, 0)) == NULL) { 441 printf("%s: can't get wu_w", DEVNAME(sd->sd_sc)); 442 goto bad; 443 } 444 445 wu->swu_blk_start = 0; 446 while (datalen != 0) { 447 strip_no = lbaoffs >> strip_bits; 448 strip_offs = lbaoffs & (strip_size - 1); 449 chunk_offs = (strip_no / no_chunk) << strip_bits; 450 phys_offs = chunk_offs + strip_offs + 451 ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT); 452 453 /* get size remaining in this stripe */ 454 length = MIN(strip_size - strip_offs, datalen); 455 456 /* map disk offset to parity/data drive */ 457 chunk = strip_no % no_chunk; 458 459 qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2)); 460 if (qchunk == 0) 461 pchunk = no_chunk + 1; 462 else 463 pchunk = qchunk - 1; 464 if (chunk >= pchunk) 465 chunk++; 466 if (chunk >= qchunk) 467 chunk++; 468 469 lba = phys_offs >> DEV_BSHIFT; 470 471 /* XXX big hammer.. exclude I/O from entire stripe */ 472 if (wu->swu_blk_start == 0) 473 wu->swu_blk_start = chunk_offs >> DEV_BSHIFT; 474 wu->swu_blk_end = ((chunk_offs + (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1; 475 476 fail = 0; 477 fchunk = -1; 478 479 /* Get disk-fail flags */ 480 for (i=0; i< no_chunk+2; i++) { 481 scp = sd->sd_vol.sv_chunks[i]; 482 switch (scp->src_meta.scm_status) { 483 case BIOC_SDOFFLINE: 484 case BIOC_SDREBUILD: 485 case BIOC_SDHOTSPARE: 486 if (i == qchunk) 487 fail |= SR_FAILQ; 488 else if (i == pchunk) 489 fail |= SR_FAILP; 490 else if (i == chunk) 491 fail |= SR_FAILX; 492 else { 493 /* dual data-disk failure */ 494 fail |= SR_FAILY; 495 fchunk = i; 496 } 497 break; 498 } 499 } 500 if (xs->flags & SCSI_DATA_IN) { 501 if (!(fail & SR_FAILX)) { 502 /* drive is good. issue single read request */ 503 if (sr_raid6_addio(wu, chunk, lba, length, 504 data, xs->flags, 0, NULL, NULL, 0)) 505 goto bad; 506 } else if (fail & SR_FAILP) { 507 /* Dx, P failed */ 508 printf("Disk %llx offline, " 509 "regenerating Dx+P\n", chunk); 510 511 qbuf = sr_get_block(sd, length); 512 if (qbuf == NULL) 513 goto bad; 514 515 /* Calculate: Dx*gx = Q^(Dz*gz) 516 * Q: sr_raid6_xorp(data, --, length); 517 * Dz: sr_raid6_xorq(data, --, length, gf_pow[i]); 518 */ 519 memset(data, 0, length); 520 for (i = 0; i < no_chunk+2; i++) { 521 if (i == qchunk) { 522 /* Read Q */ 523 if (sr_raid6_addio(wu, i, lba, 524 length, NULL, SCSI_DATA_IN, 525 SR_CCBF_FREEBUF, qbuf, 526 NULL, 0)) 527 goto bad; 528 } else if (i != chunk && i != pchunk) { 529 /* Read Dz * gz */ 530 if (sr_raid6_addio(wu, i, lba, 531 length, NULL, SCSI_DATA_IN, 532 SR_CCBF_FREEBUF, NULL, 533 qbuf, gf_pow[i])) 534 goto bad; 535 } 536 } 537 538 /* run fake wu when read i/o is complete */ 539 if (wu_w == NULL && 540 (wu_w = sr_wu_get(sd, 0)) == NULL) 541 goto bad; 542 543 wu_w->swu_flags |= SR_WUF_FAIL; 544 if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0, 545 SR_CCBF_FREEBUF, NULL, data, 546 gf_inv(gf_pow[chunk]))) 547 goto bad; 548 } else if (fail & SR_FAILY) { 549 /* Dx, Dy failed */ 550 printf("Disk %llx & %llx offline, " 551 "regenerating Dx+Dy\n", chunk, fchunk); 552 qbuf = sr_get_block(sd, length); 553 if (qbuf == NULL) 554 goto bad; 555 pbuf = sr_get_block(sd, length); 556 if (pbuf == NULL) 557 goto bad; 558 559 /* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz 560 * Q: sr_raid6_xorp(qbuf, --, length); 561 * P: sr_raid6_xorp(pbuf, --, length); 562 * Dz: sr_raid6_xorp(pbuf, --, length); 563 * sr_raid6_xorq(qbuf, --, length, gf_pow[i]); 564 */ 565 memset(data, 0, length); 566 for (i = 0; i < no_chunk+2; i++) { 567 if (i == qchunk) { 568 /* read Q */ 569 if (sr_raid6_addio(wu, i, lba, 570 length, NULL, SCSI_DATA_IN, 571 SR_CCBF_FREEBUF, qbuf, 572 NULL, 0)) 573 goto bad; 574 } else if (i == pchunk) { 575 /* read P */ 576 if (sr_raid6_addio(wu, i, lba, 577 length, NULL, SCSI_DATA_IN, 578 SR_CCBF_FREEBUF, pbuf, 579 NULL, 0)) 580 goto bad; 581 } else if (i != chunk) { 582 /* read Dz * gz */ 583 if (sr_raid6_addio(wu, i, lba, 584 length, NULL, SCSI_DATA_IN, 585 SR_CCBF_FREEBUF, pbuf, 586 qbuf, gf_pow[i])) 587 goto bad; 588 } 589 } 590 591 /* run fake wu when read i/o is complete */ 592 if (wu_w == NULL && 593 (wu_w = sr_wu_get(sd, 0)) == NULL) 594 goto bad; 595 596 wu_w->swu_flags |= SR_WUF_FAIL; 597 if (sr_raid6_addio(wu_w, 0, 0, length, pbuf, 0, 598 SR_CCBF_FREEBUF, NULL, data, 599 gf_inv(gf_pow[255+chunk-fchunk] ^ 1))) 600 goto bad; 601 if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0, 602 SR_CCBF_FREEBUF, NULL, data, 603 gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]))) 604 goto bad; 605 } else { 606 /* Two cases: single disk (Dx) or (Dx+Q) 607 * Dx = Dz ^ P (same as RAID5) 608 */ 609 printf("Disk %llx offline, " 610 "regenerating Dx%s\n", chunk, 611 fail & SR_FAILQ ? "+Q" : " single"); 612 613 /* Calculate: Dx = P^Dz 614 * P: sr_raid6_xorp(data, ---, length); 615 * Dz: sr_raid6_xorp(data, ---, length); 616 */ 617 memset(data, 0, length); 618 for (i = 0; i < no_chunk+2; i++) { 619 if (i != chunk && i != qchunk) { 620 /* Read Dz */ 621 if (sr_raid6_addio(wu, i, lba, 622 length, NULL, SCSI_DATA_IN, 623 SR_CCBF_FREEBUF, data, 624 NULL, 0)) 625 goto bad; 626 } 627 } 628 629 /* data will contain correct value on completion */ 630 } 631 } else { 632 /* XXX handle writes to failed/offline disk? */ 633 if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP)) 634 goto bad; 635 636 /* 637 * initialize pbuf with contents of new data to be 638 * written. This will be XORed with old data and old 639 * parity in the intr routine. The result in pbuf 640 * is the new parity data. 641 */ 642 qbuf = sr_get_block(sd, length); 643 if (qbuf == NULL) 644 goto bad; 645 646 pbuf = sr_get_block(sd, length); 647 if (pbuf == NULL) 648 goto bad; 649 650 /* Calulate P = Dn; Q = gn * Dn */ 651 if (gf_premul(gf_pow[chunk])) 652 goto bad; 653 sr_raid6_xorp(pbuf, data, length); 654 sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]); 655 656 /* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */ 657 if (sr_raid6_addio(wu, chunk, lba, length, NULL, 658 SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf, 659 gf_pow[chunk])) 660 goto bad; 661 662 /* Read old xor-parity: P ^= P' */ 663 if (sr_raid6_addio(wu, pchunk, lba, length, NULL, 664 SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0)) 665 goto bad; 666 667 /* Read old q-parity: Q ^= Q' */ 668 if (sr_raid6_addio(wu, qchunk, lba, length, NULL, 669 SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0)) 670 goto bad; 671 672 /* write new data */ 673 if (sr_raid6_addio(wu_w, chunk, lba, length, data, 674 xs->flags, 0, NULL, NULL, 0)) 675 goto bad; 676 677 /* write new xor-parity */ 678 if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf, 679 xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) 680 goto bad; 681 682 /* write new q-parity */ 683 if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf, 684 xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) 685 goto bad; 686 } 687 688 /* advance to next block */ 689 lbaoffs += length; 690 datalen -= length; 691 data += length; 692 } 693 694 s = splbio(); 695 if (wu_w) { 696 /* collide write request with reads */ 697 wu_w->swu_blk_start = wu->swu_blk_start; 698 wu_w->swu_blk_end = wu->swu_blk_end; 699 700 /* 701 * put xs block in write request (scsi_done not called till 702 * write completes) 703 */ 704 wu_w->swu_xs = wu->swu_xs; 705 wu->swu_xs = NULL; 706 707 wu_w->swu_state = SR_WU_DEFERRED; 708 wu->swu_collider = wu_w; 709 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 710 } 711 712 /* rebuild io, let rebuild routine deal with it */ 713 if (wu->swu_flags & SR_WUF_REBUILD) 714 goto queued; 715 716 /* current io failed, restart */ 717 if (wu->swu_state == SR_WU_RESTART) 718 goto start; 719 720 /* deferred io failed, don't restart */ 721 if (wu->swu_state == SR_WU_REQUEUE) 722 goto queued; 723 724 if (sr_check_io_collision(wu)) 725 goto queued; 726 727 start: 728 sr_raid_startwu(wu); 729 queued: 730 splx(s); 731 return (0); 732 bad: 733 /* wu is unwound by sr_wu_put */ 734 if (wu_w) 735 sr_wu_put(wu_w); 736 return (1); 737 } 738 739 /* Handle failure I/O completion */ 740 int 741 sr_failio(struct sr_workunit *wu) 742 { 743 struct sr_discipline *sd = wu->swu_dis; 744 struct sr_ccb *ccb; 745 746 if (!(wu->swu_flags & SR_WUF_FAIL)) 747 return (0); 748 749 /* Wu is a 'fake'.. don't do real I/O just intr */ 750 TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link); 751 TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) 752 sr_raid6_intr(&ccb->ccb_buf); 753 return (1); 754 } 755 756 void 757 sr_raid6_intr(struct buf *bp) 758 { 759 struct sr_ccb *ccb = (struct sr_ccb *)bp; 760 struct sr_workunit *wu = ccb->ccb_wu, *wup; 761 struct sr_discipline *sd = wu->swu_dis; 762 struct scsi_xfer *xs = wu->swu_xs; 763 struct sr_softc *sc = sd->sd_sc; 764 struct sr_raid6_opaque *pq = ccb->ccb_opaque; 765 int s, pend; 766 767 DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n", 768 DEVNAME(sc), bp, xs); 769 770 DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" 771 " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), 772 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags, 773 ccb->ccb_buf.b_blkno, ccb->ccb_target); 774 775 s = splbio(); 776 777 if (ccb->ccb_buf.b_flags & B_ERROR) { 778 DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", 779 DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target); 780 printf("io error: disk %x\n", ccb->ccb_target); 781 wu->swu_ios_failed++; 782 ccb->ccb_state = SR_CCB_FAILED; 783 if (ccb->ccb_target != -1) 784 sd->sd_set_chunk_state(sd, ccb->ccb_target, 785 BIOC_SDOFFLINE); 786 else 787 panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); 788 } else { 789 ccb->ccb_state = SR_CCB_OK; 790 wu->swu_ios_succeeded++; 791 792 /* XOR data to result */ 793 if (pq) { 794 if (pq->pbuf) 795 /* Calculate xor-parity */ 796 sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data, 797 ccb->ccb_buf.b_bcount); 798 if (pq->qbuf) 799 /* Calculate q-parity */ 800 sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data, 801 ccb->ccb_buf.b_bcount, pq->gn); 802 free(pq, M_DEVBUF); 803 ccb->ccb_opaque = NULL; 804 } 805 } 806 807 /* free allocated data buffer */ 808 if (ccb->ccb_flag & SR_CCBF_FREEBUF) { 809 sr_put_block(sd, ccb->ccb_buf.b_data); 810 ccb->ccb_buf.b_data = NULL; 811 } 812 wu->swu_ios_complete++; 813 814 DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", 815 DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, 816 wu->swu_ios_failed); 817 818 if (wu->swu_ios_complete >= wu->swu_io_count) { 819 820 /* if all ios failed, retry reads and give up on writes */ 821 if (wu->swu_ios_failed == wu->swu_ios_complete) { 822 if (xs->flags & SCSI_DATA_IN) { 823 printf("%s: retrying read on block %lld\n", 824 DEVNAME(sc), ccb->ccb_buf.b_blkno); 825 sr_ccb_put(ccb); 826 TAILQ_INIT(&wu->swu_ccb); 827 wu->swu_state = SR_WU_RESTART; 828 if (sd->sd_scsi_rw(wu)) 829 goto bad; 830 else 831 goto retry; 832 } else { 833 printf("%s: permanently fail write on block " 834 "%lld\n", DEVNAME(sc), 835 ccb->ccb_buf.b_blkno); 836 xs->error = XS_DRIVER_STUFFUP; 837 goto bad; 838 } 839 } 840 841 if (xs != NULL) { 842 xs->error = XS_NOERROR; 843 xs->resid = 0; 844 xs->flags |= ITSDONE; 845 } 846 847 pend = 0; 848 TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { 849 if (wu == wup) { 850 /* wu on pendq, remove */ 851 TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); 852 pend = 1; 853 854 if (wu->swu_collider) { 855 if (wu->swu_ios_failed) 856 /* toss all ccbs and recreate */ 857 sr_raid6_recreate_wu(wu->swu_collider); 858 859 /* restart deferred wu */ 860 wu->swu_collider->swu_state = 861 SR_WU_INPROGRESS; 862 TAILQ_REMOVE(&sd->sd_wu_defq, 863 wu->swu_collider, swu_link); 864 if (sr_failio(wu->swu_collider) == 0) 865 sr_raid_startwu(wu->swu_collider); 866 } 867 break; 868 } 869 } 870 871 if (!pend) 872 printf("%s: wu: %p not on pending queue\n", 873 DEVNAME(sc), wu); 874 875 if (wu->swu_flags & SR_WUF_REBUILD) { 876 if (wu->swu_xs->flags & SCSI_DATA_OUT) { 877 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 878 wakeup(wu); 879 } 880 } else { 881 /* do not change the order of these 2 functions */ 882 sr_wu_put(wu); 883 if (xs != NULL) 884 scsi_done(xs); 885 } 886 887 if (sd->sd_sync && sd->sd_wu_pending == 0) 888 wakeup(sd); 889 } 890 891 retry: 892 splx(s); 893 return; 894 bad: 895 xs->error = XS_DRIVER_STUFFUP; 896 xs->flags |= ITSDONE; 897 if (wu->swu_flags & SR_WUF_REBUILD) { 898 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 899 wakeup(wu); 900 } else { 901 /* do not change the order of these 2 functions */ 902 sr_wu_put(wu); 903 scsi_done(xs); 904 } 905 906 splx(s); 907 } 908 909 void 910 sr_raid6_recreate_wu(struct sr_workunit *wu) 911 { 912 struct sr_discipline *sd = wu->swu_dis; 913 struct sr_workunit *wup = wu; 914 struct sr_ccb *ccb; 915 916 do { 917 DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", wup); 918 919 /* toss all ccbs */ 920 while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { 921 TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); 922 sr_ccb_put(ccb); 923 } 924 TAILQ_INIT(&wup->swu_ccb); 925 926 /* recreate ccbs */ 927 wup->swu_state = SR_WU_REQUEUE; 928 if (sd->sd_scsi_rw(wup)) 929 panic("could not requeue io"); 930 931 wup = wup->swu_collider; 932 } while (wup); 933 } 934 935 int 936 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len, 937 void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn) 938 { 939 struct sr_discipline *sd = wu->swu_dis; 940 struct sr_ccb *ccb; 941 struct sr_raid6_opaque *pqbuf; 942 943 ccb = sr_ccb_get(sd); 944 if (!ccb) 945 return (-1); 946 947 /* allocate temporary buffer */ 948 if (data == NULL) { 949 data = sr_get_block(sd, len); 950 if (data == NULL) 951 return (-1); 952 } 953 954 DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n", 955 flag & SCSI_DATA_IN ? "read" : "write", 956 dsk, blk, len, pbuf, qbuf); 957 958 ccb->ccb_flag = ccbflag; 959 if (flag & SCSI_POLL) { 960 ccb->ccb_buf.b_flags = 0; 961 ccb->ccb_buf.b_iodone = NULL; 962 } else { 963 ccb->ccb_buf.b_flags = B_CALL; 964 ccb->ccb_buf.b_iodone = sr_raid6_intr; 965 } 966 if (flag & SCSI_DATA_IN) 967 ccb->ccb_buf.b_flags |= B_READ; 968 else 969 ccb->ccb_buf.b_flags |= B_WRITE; 970 971 /* add offset for metadata */ 972 ccb->ccb_buf.b_flags |= B_PHYS; 973 ccb->ccb_buf.b_blkno = blk; 974 ccb->ccb_buf.b_bcount = len; 975 ccb->ccb_buf.b_bufsize = len; 976 ccb->ccb_buf.b_resid = len; 977 ccb->ccb_buf.b_data = data; 978 ccb->ccb_buf.b_error = 0; 979 ccb->ccb_buf.b_proc = curproc; 980 ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm; 981 ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn; 982 if ((ccb->ccb_buf.b_flags & B_READ) == 0) 983 ccb->ccb_buf.b_vp->v_numoutput++; 984 985 ccb->ccb_wu = wu; 986 ccb->ccb_target = dsk; 987 if (pbuf || qbuf) { 988 if (qbuf && gf_premul(gn)) 989 return (-1); 990 991 pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL); 992 if (pqbuf == NULL) { 993 sr_ccb_put(ccb); 994 return (-1); 995 } 996 pqbuf->pbuf = pbuf; 997 pqbuf->qbuf = qbuf; 998 pqbuf->gn = gn; 999 ccb->ccb_opaque = pqbuf; 1000 } 1001 1002 LIST_INIT(&ccb->ccb_buf.b_dep); 1003 TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); 1004 1005 DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d " 1006 "b_blkno: %x b_flags 0x%0x b_data %p\n", 1007 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 1008 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, 1009 ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); 1010 1011 wu->swu_io_count++; 1012 1013 return (0); 1014 } 1015 1016 /* Perform RAID6 parity calculation. 1017 * P=xor parity, Q=GF256 parity, D=data, gn=disk# */ 1018 void 1019 sr_raid6_xorp(void *p, void *d, int len) 1020 { 1021 uint8_t *pbuf = p, *data = d; 1022 1023 while (len--) 1024 pbuf[len] ^= data[len]; 1025 } 1026 1027 void 1028 sr_raid6_xorq(void *q, void *d, int len, int gn) 1029 { 1030 uint8_t *qbuf = q, *data = d; 1031 uint8_t *gn_map = gf_map[gn]; 1032 1033 /* Have to do this a byte at a time */ 1034 /* Faster multiply.. gn is always constant */ 1035 while (len--) 1036 qbuf[len] ^= gn_map[data[len]]; 1037 } 1038 1039 /* Create GF256 log/pow tables: polynomial = 0x11D */ 1040 void 1041 gf_init(void) 1042 { 1043 int i; 1044 uint8_t p = 1; 1045 1046 /* use 2N pow table to avoid using % in multiply */ 1047 for (i=0; i<256; i++) { 1048 gf_log[p] = i; 1049 gf_pow[i] = gf_pow[i+255] = p; 1050 p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00)); 1051 } 1052 gf_log[0] = 512; 1053 } 1054 1055 uint8_t 1056 gf_inv(uint8_t a) 1057 { 1058 return gf_pow[255 - gf_log[a]]; 1059 } 1060 1061 /* Precalculate multiplication tables for drive gn */ 1062 int 1063 gf_premul(uint8_t gn) 1064 { 1065 int i; 1066 1067 if (gf_map[gn] != NULL) 1068 return (0); 1069 1070 if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL) 1071 return (-1); 1072 1073 for (i=0; i<256; i++) 1074 gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]]; 1075 return (0); 1076 } 1077