1 /* $OpenBSD: softraid_raid1.c,v 1.20 2009/12/07 14:27:12 jsing Exp $ */ 2 /* 3 * Copyright (c) 2007 Marco Peereboom <marco@peereboom.us> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include "bio.h" 19 20 #include <sys/param.h> 21 #include <sys/systm.h> 22 #include <sys/buf.h> 23 #include <sys/device.h> 24 #include <sys/ioctl.h> 25 #include <sys/proc.h> 26 #include <sys/malloc.h> 27 #include <sys/kernel.h> 28 #include <sys/disk.h> 29 #include <sys/rwlock.h> 30 #include <sys/queue.h> 31 #include <sys/fcntl.h> 32 #include <sys/disklabel.h> 33 #include <sys/mount.h> 34 #include <sys/sensors.h> 35 #include <sys/stat.h> 36 #include <sys/conf.h> 37 #include <sys/uio.h> 38 39 #include <scsi/scsi_all.h> 40 #include <scsi/scsiconf.h> 41 #include <scsi/scsi_disk.h> 42 43 #include <dev/softraidvar.h> 44 #include <dev/rndvar.h> 45 46 /* RAID 1 functions. */ 47 int sr_raid1_alloc_resources(struct sr_discipline *); 48 int sr_raid1_free_resources(struct sr_discipline *); 49 int sr_raid1_rw(struct sr_workunit *); 50 void sr_raid1_intr(struct buf *); 51 void sr_raid1_recreate_wu(struct sr_workunit *); 52 53 /* Discipline initialisation. */ 54 void 55 sr_raid1_discipline_init(struct sr_discipline *sd) 56 { 57 58 /* Fill out discipline members. */ 59 sd->sd_type = SR_MD_RAID1; 60 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE | 61 SR_CAP_REBUILD; 62 sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no; 63 sd->sd_max_wu = SR_RAID1_NOWU; 64 65 /* Setup discipline pointers. */ 66 sd->sd_alloc_resources = sr_raid1_alloc_resources; 67 sd->sd_free_resources = sr_raid1_free_resources; 68 sd->sd_start_discipline = NULL; 69 sd->sd_scsi_inquiry = sr_raid_inquiry; 70 sd->sd_scsi_read_cap = sr_raid_read_cap; 71 sd->sd_scsi_tur = sr_raid_tur; 72 sd->sd_scsi_req_sense = sr_raid_request_sense; 73 sd->sd_scsi_start_stop = sr_raid_start_stop; 74 sd->sd_scsi_sync = sr_raid_sync; 75 sd->sd_scsi_rw = sr_raid1_rw; 76 sd->sd_set_chunk_state = sr_raid1_set_chunk_state; 77 sd->sd_set_vol_state = sr_raid1_set_vol_state; 78 } 79 80 int 81 sr_raid1_alloc_resources(struct sr_discipline *sd) 82 { 83 int rv = EINVAL; 84 85 if (!sd) 86 return (rv); 87 88 DNPRINTF(SR_D_DIS, "%s: sr_raid1_alloc_resources\n", 89 DEVNAME(sd->sd_sc)); 90 91 if (sr_wu_alloc(sd)) 92 goto bad; 93 if (sr_ccb_alloc(sd)) 94 goto bad; 95 96 rv = 0; 97 bad: 98 return (rv); 99 } 100 101 int 102 sr_raid1_free_resources(struct sr_discipline *sd) 103 { 104 int rv = EINVAL; 105 106 if (!sd) 107 return (rv); 108 109 DNPRINTF(SR_D_DIS, "%s: sr_raid1_free_resources\n", 110 DEVNAME(sd->sd_sc)); 111 112 sr_wu_free(sd); 113 sr_ccb_free(sd); 114 115 rv = 0; 116 return (rv); 117 } 118 119 void 120 sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 121 { 122 int old_state, s; 123 124 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 125 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 126 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 127 128 /* ok to go to splbio since this only happens in error path */ 129 s = splbio(); 130 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 131 132 /* multiple IOs to the same chunk that fail will come through here */ 133 if (old_state == new_state) 134 goto done; 135 136 switch (old_state) { 137 case BIOC_SDONLINE: 138 switch (new_state) { 139 case BIOC_SDOFFLINE: 140 case BIOC_SDSCRUB: 141 break; 142 default: 143 goto die; 144 } 145 break; 146 147 case BIOC_SDOFFLINE: 148 switch (new_state) { 149 case BIOC_SDREBUILD: 150 case BIOC_SDHOTSPARE: 151 break; 152 default: 153 goto die; 154 } 155 break; 156 157 case BIOC_SDSCRUB: 158 if (new_state == BIOC_SDONLINE) { 159 ; 160 } else 161 goto die; 162 break; 163 164 case BIOC_SDREBUILD: 165 switch (new_state) { 166 case BIOC_SDONLINE: 167 break; 168 case BIOC_SDOFFLINE: 169 /* Abort rebuild since the rebuild chunk disappeared. */ 170 sd->sd_reb_abort = 1; 171 break; 172 default: 173 goto die; 174 } 175 break; 176 177 case BIOC_SDHOTSPARE: 178 switch (new_state) { 179 case BIOC_SDOFFLINE: 180 case BIOC_SDREBUILD: 181 break; 182 default: 183 goto die; 184 } 185 break; 186 187 default: 188 die: 189 splx(s); /* XXX */ 190 panic("%s: %s: %s: invalid chunk state transition " 191 "%d -> %d\n", DEVNAME(sd->sd_sc), 192 sd->sd_meta->ssd_devname, 193 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 194 old_state, new_state); 195 /* NOTREACHED */ 196 } 197 198 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 199 sd->sd_set_vol_state(sd); 200 201 sd->sd_must_flush = 1; 202 workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); 203 done: 204 splx(s); 205 } 206 207 void 208 sr_raid1_set_vol_state(struct sr_discipline *sd) 209 { 210 int states[SR_MAX_STATES]; 211 int new_state, i, s, nd; 212 int old_state = sd->sd_vol_status; 213 214 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 215 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 216 217 nd = sd->sd_meta->ssdi.ssd_chunk_no; 218 219 for (i = 0; i < SR_MAX_STATES; i++) 220 states[i] = 0; 221 222 for (i = 0; i < nd; i++) { 223 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 224 if (s >= SR_MAX_STATES) 225 panic("%s: %s: %s: invalid chunk state", 226 DEVNAME(sd->sd_sc), 227 sd->sd_meta->ssd_devname, 228 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 229 states[s]++; 230 } 231 232 if (states[BIOC_SDONLINE] == nd) 233 new_state = BIOC_SVONLINE; 234 else if (states[BIOC_SDONLINE] == 0) 235 new_state = BIOC_SVOFFLINE; 236 else if (states[BIOC_SDSCRUB] != 0) 237 new_state = BIOC_SVSCRUB; 238 else if (states[BIOC_SDREBUILD] != 0) 239 new_state = BIOC_SVREBUILD; 240 else if (states[BIOC_SDOFFLINE] != 0) 241 new_state = BIOC_SVDEGRADED; 242 else { 243 #ifdef SR_DEBUG 244 DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state " 245 "was %d\n", DEVNAME(sd->sd_sc), old_state); 246 for (i = 0; i < nd; i++) 247 DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n", 248 DEVNAME(sd->sd_sc), i, 249 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 250 #endif 251 panic("invalid volume state"); 252 } 253 254 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid1_set_vol_state %d -> %d\n", 255 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 256 old_state, new_state); 257 258 switch (old_state) { 259 case BIOC_SVONLINE: 260 switch (new_state) { 261 case BIOC_SVONLINE: /* can go to same state */ 262 case BIOC_SVOFFLINE: 263 case BIOC_SVDEGRADED: 264 case BIOC_SVREBUILD: /* happens on boot */ 265 break; 266 default: 267 goto die; 268 } 269 break; 270 271 case BIOC_SVOFFLINE: 272 /* XXX this might be a little too much */ 273 goto die; 274 275 case BIOC_SVSCRUB: 276 switch (new_state) { 277 case BIOC_SVONLINE: 278 case BIOC_SVOFFLINE: 279 case BIOC_SVDEGRADED: 280 case BIOC_SVSCRUB: /* can go to same state */ 281 break; 282 default: 283 goto die; 284 } 285 break; 286 287 case BIOC_SVBUILDING: 288 switch (new_state) { 289 case BIOC_SVONLINE: 290 case BIOC_SVOFFLINE: 291 case BIOC_SVBUILDING: /* can go to the same state */ 292 break; 293 default: 294 goto die; 295 } 296 break; 297 298 case BIOC_SVREBUILD: 299 switch (new_state) { 300 case BIOC_SVONLINE: 301 case BIOC_SVOFFLINE: 302 case BIOC_SVDEGRADED: 303 case BIOC_SVREBUILD: /* can go to the same state */ 304 break; 305 default: 306 goto die; 307 } 308 break; 309 310 case BIOC_SVDEGRADED: 311 switch (new_state) { 312 case BIOC_SVOFFLINE: 313 case BIOC_SVREBUILD: 314 case BIOC_SVDEGRADED: /* can go to the same state */ 315 break; 316 default: 317 goto die; 318 } 319 break; 320 321 default: 322 die: 323 panic("%s: %s: invalid volume state transition " 324 "%d -> %d\n", DEVNAME(sd->sd_sc), 325 sd->sd_meta->ssd_devname, 326 old_state, new_state); 327 /* NOTREACHED */ 328 } 329 330 sd->sd_vol_status = new_state; 331 332 /* If we have just become degraded, look for a hotspare. */ 333 if (new_state == BIOC_SVDEGRADED) 334 workq_add_task(NULL, 0, sr_hotspare_rebuild_callback, sd, NULL); 335 } 336 337 int 338 sr_raid1_rw(struct sr_workunit *wu) 339 { 340 struct sr_discipline *sd = wu->swu_dis; 341 struct scsi_xfer *xs = wu->swu_xs; 342 struct sr_ccb *ccb; 343 struct buf *b; 344 struct sr_chunk *scp; 345 int ios, x, i, s, rt; 346 daddr64_t blk; 347 348 /* blk and scsi error will be handled by sr_validate_io */ 349 if (sr_validate_io(wu, &blk, "sr_raid1_rw")) 350 goto bad; 351 352 /* calculate physical block */ 353 blk += SR_META_SIZE + SR_META_OFFSET; 354 355 if (xs->flags & SCSI_DATA_IN) 356 ios = 1; 357 else 358 ios = sd->sd_meta->ssdi.ssd_chunk_no; 359 wu->swu_io_count = ios; 360 361 for (i = 0; i < ios; i++) { 362 ccb = sr_ccb_get(sd); 363 if (!ccb) { 364 /* should never happen but handle more gracefully */ 365 printf("%s: %s: too many ccbs queued\n", 366 DEVNAME(sd->sd_sc), 367 sd->sd_meta->ssd_devname); 368 goto bad; 369 } 370 b = &ccb->ccb_buf; 371 372 if (xs->flags & SCSI_POLL) { 373 b->b_flags = 0; 374 b->b_iodone = NULL; 375 } else { 376 b->b_flags = B_CALL; 377 b->b_iodone = sr_raid1_intr; 378 } 379 380 b->b_flags |= B_PHYS; 381 b->b_blkno = blk; 382 b->b_bcount = xs->datalen; 383 b->b_bufsize = xs->datalen; 384 b->b_resid = xs->datalen; 385 b->b_data = xs->data; 386 b->b_error = 0; 387 b->b_proc = curproc; 388 ccb->ccb_wu = wu; 389 390 if (xs->flags & SCSI_DATA_IN) { 391 rt = 0; 392 ragain: 393 /* interleave reads */ 394 x = sd->mds.mdd_raid1.sr1_counter++ % 395 sd->sd_meta->ssdi.ssd_chunk_no; 396 scp = sd->sd_vol.sv_chunks[x]; 397 switch (scp->src_meta.scm_status) { 398 case BIOC_SDONLINE: 399 case BIOC_SDSCRUB: 400 b->b_flags |= B_READ; 401 break; 402 403 case BIOC_SDOFFLINE: 404 case BIOC_SDREBUILD: 405 case BIOC_SDHOTSPARE: 406 if (rt++ < sd->sd_meta->ssdi.ssd_chunk_no) 407 goto ragain; 408 409 /* FALLTHROUGH */ 410 default: 411 /* volume offline */ 412 printf("%s: is offline, can't read\n", 413 DEVNAME(sd->sd_sc)); 414 sr_ccb_put(ccb); 415 goto bad; 416 } 417 } else { 418 /* writes go on all working disks */ 419 x = i; 420 scp = sd->sd_vol.sv_chunks[x]; 421 switch (scp->src_meta.scm_status) { 422 case BIOC_SDONLINE: 423 case BIOC_SDSCRUB: 424 case BIOC_SDREBUILD: 425 b->b_flags |= B_WRITE; 426 break; 427 428 case BIOC_SDHOTSPARE: /* should never happen */ 429 case BIOC_SDOFFLINE: 430 wu->swu_io_count--; 431 sr_ccb_put(ccb); 432 continue; 433 434 default: 435 goto bad; 436 } 437 438 } 439 ccb->ccb_target = x; 440 b->b_dev = sd->sd_vol.sv_chunks[x]->src_dev_mm; 441 b->b_vp = sd->sd_vol.sv_chunks[x]->src_vn; 442 if ((b->b_flags & B_READ) == 0) 443 b->b_vp->v_numoutput++; 444 445 LIST_INIT(&b->b_dep); 446 447 TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); 448 449 DNPRINTF(SR_D_DIS, "%s: %s: sr_raid1: b_bcount: %d " 450 "b_blkno: %x b_flags 0x%0x b_data %p\n", 451 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 452 b->b_bcount, b->b_blkno, 453 b->b_flags, b->b_data); 454 } 455 456 s = splbio(); 457 458 /* rebuild io, let rebuild routine deal with it */ 459 if (wu->swu_flags & SR_WUF_REBUILD) 460 goto queued; 461 462 /* current io failed, restart */ 463 if (wu->swu_state == SR_WU_RESTART) 464 goto start; 465 466 /* deferred io failed, don't restart */ 467 if (wu->swu_state == SR_WU_REQUEUE) 468 goto queued; 469 470 if (sr_check_io_collision(wu)) 471 goto queued; 472 473 start: 474 sr_raid_startwu(wu); 475 queued: 476 splx(s); 477 return (0); 478 bad: 479 /* wu is unwound by sr_wu_put */ 480 return (1); 481 } 482 483 void 484 sr_raid1_intr(struct buf *bp) 485 { 486 struct sr_ccb *ccb = (struct sr_ccb *)bp; 487 struct sr_workunit *wu = ccb->ccb_wu, *wup; 488 struct sr_discipline *sd = wu->swu_dis; 489 struct scsi_xfer *xs = wu->swu_xs; 490 struct sr_softc *sc = sd->sd_sc; 491 struct buf *b; 492 int s, pend; 493 494 DNPRINTF(SR_D_INTR, "%s: sr_intr bp %x xs %x\n", 495 DEVNAME(sc), bp, xs); 496 497 b = &ccb->ccb_buf; 498 DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" 499 " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), 500 b->b_bcount, b->b_resid, b->b_flags, b->b_blkno, ccb->ccb_target); 501 502 s = splbio(); 503 504 if (b->b_flags & B_ERROR) { 505 DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", 506 DEVNAME(sc), b->b_blkno, ccb->ccb_target); 507 wu->swu_ios_failed++; 508 ccb->ccb_state = SR_CCB_FAILED; 509 if (ccb->ccb_target != -1) 510 sd->sd_set_chunk_state(sd, ccb->ccb_target, 511 BIOC_SDOFFLINE); 512 else 513 panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); 514 } else { 515 ccb->ccb_state = SR_CCB_OK; 516 wu->swu_ios_succeeded++; 517 } 518 wu->swu_ios_complete++; 519 520 DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", 521 DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, 522 wu->swu_ios_failed); 523 524 if (wu->swu_ios_complete >= wu->swu_io_count) { 525 /* if all ios failed, retry reads and give up on writes */ 526 if (wu->swu_ios_failed == wu->swu_ios_complete) { 527 if (xs->flags & SCSI_DATA_IN) { 528 printf("%s: retrying read on block %lld\n", 529 DEVNAME(sc), b->b_blkno); 530 sr_ccb_put(ccb); 531 TAILQ_INIT(&wu->swu_ccb); 532 wu->swu_state = SR_WU_RESTART; 533 if (sd->sd_scsi_rw(wu)) 534 goto bad; 535 else 536 goto retry; 537 } else { 538 printf("%s: permanently fail write on block " 539 "%lld\n", DEVNAME(sc), b->b_blkno); 540 xs->error = XS_DRIVER_STUFFUP; 541 goto bad; 542 } 543 } 544 545 xs->error = XS_NOERROR; 546 xs->resid = 0; 547 xs->flags |= ITSDONE; 548 549 pend = 0; 550 TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { 551 if (wu == wup) { 552 /* wu on pendq, remove */ 553 TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); 554 pend = 1; 555 556 if (wu->swu_collider) { 557 if (wu->swu_ios_failed) 558 /* toss all ccbs and recreate */ 559 sr_raid1_recreate_wu(wu->swu_collider); 560 561 /* restart deferred wu */ 562 wu->swu_collider->swu_state = 563 SR_WU_INPROGRESS; 564 TAILQ_REMOVE(&sd->sd_wu_defq, 565 wu->swu_collider, swu_link); 566 sr_raid_startwu(wu->swu_collider); 567 } 568 break; 569 } 570 } 571 572 if (!pend) 573 printf("%s: wu: %p not on pending queue\n", 574 DEVNAME(sc), wu); 575 576 if (wu->swu_flags & SR_WUF_REBUILD) { 577 if (wu->swu_xs->flags & SCSI_DATA_OUT) { 578 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 579 wakeup(wu); 580 } 581 } else { 582 /* do not change the order of these 2 functions */ 583 sr_wu_put(wu); 584 scsi_done(xs); 585 } 586 587 if (sd->sd_sync && sd->sd_wu_pending == 0) 588 wakeup(sd); 589 } 590 591 retry: 592 splx(s); 593 return; 594 bad: 595 xs->error = XS_DRIVER_STUFFUP; 596 xs->flags |= ITSDONE; 597 if (wu->swu_flags & SR_WUF_REBUILD) { 598 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 599 wakeup(wu); 600 } else { 601 /* do not change the order of these 2 functions */ 602 sr_wu_put(wu); 603 scsi_done(xs); 604 } 605 606 splx(s); 607 } 608 609 void 610 sr_raid1_recreate_wu(struct sr_workunit *wu) 611 { 612 struct sr_discipline *sd = wu->swu_dis; 613 struct sr_workunit *wup = wu; 614 struct sr_ccb *ccb; 615 616 do { 617 DNPRINTF(SR_D_INTR, "%s: sr_raid1_recreate_wu: %p\n", wup); 618 619 /* toss all ccbs */ 620 while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { 621 TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); 622 sr_ccb_put(ccb); 623 } 624 TAILQ_INIT(&wup->swu_ccb); 625 626 /* recreate ccbs */ 627 wup->swu_state = SR_WU_REQUEUE; 628 if (sd->sd_scsi_rw(wup)) 629 panic("could not requeue io"); 630 631 wup = wup->swu_collider; 632 } while (wup); 633 } 634