1 /* $OpenBSD: softraid_raid1.c,v 1.19 2009/08/09 14:12:25 marco Exp $ */ 2 /* 3 * Copyright (c) 2007 Marco Peereboom <marco@peereboom.us> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 #include "bio.h" 19 20 #include <sys/param.h> 21 #include <sys/systm.h> 22 #include <sys/buf.h> 23 #include <sys/device.h> 24 #include <sys/ioctl.h> 25 #include <sys/proc.h> 26 #include <sys/malloc.h> 27 #include <sys/kernel.h> 28 #include <sys/disk.h> 29 #include <sys/rwlock.h> 30 #include <sys/queue.h> 31 #include <sys/fcntl.h> 32 #include <sys/disklabel.h> 33 #include <sys/mount.h> 34 #include <sys/sensors.h> 35 #include <sys/stat.h> 36 #include <sys/conf.h> 37 #include <sys/uio.h> 38 39 #include <scsi/scsi_all.h> 40 #include <scsi/scsiconf.h> 41 #include <scsi/scsi_disk.h> 42 43 #include <dev/softraidvar.h> 44 #include <dev/rndvar.h> 45 46 /* RAID 1 functions. */ 47 int sr_raid1_alloc_resources(struct sr_discipline *); 48 int sr_raid1_free_resources(struct sr_discipline *); 49 int sr_raid1_rw(struct sr_workunit *); 50 void sr_raid1_intr(struct buf *); 51 void sr_raid1_recreate_wu(struct sr_workunit *); 52 53 /* Discipline initialisation. */ 54 void 55 sr_raid1_discipline_init(struct sr_discipline *sd) 56 { 57 58 /* Fill out discipline members. */ 59 sd->sd_type = SR_MD_RAID1; 60 sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no; 61 sd->sd_max_wu = SR_RAID1_NOWU; 62 sd->sd_rebuild = 1; 63 64 /* Setup discipline pointers. */ 65 sd->sd_alloc_resources = sr_raid1_alloc_resources; 66 sd->sd_free_resources = sr_raid1_free_resources; 67 sd->sd_start_discipline = NULL; 68 sd->sd_scsi_inquiry = sr_raid_inquiry; 69 sd->sd_scsi_read_cap = sr_raid_read_cap; 70 sd->sd_scsi_tur = sr_raid_tur; 71 sd->sd_scsi_req_sense = sr_raid_request_sense; 72 sd->sd_scsi_start_stop = sr_raid_start_stop; 73 sd->sd_scsi_sync = sr_raid_sync; 74 sd->sd_scsi_rw = sr_raid1_rw; 75 sd->sd_set_chunk_state = sr_raid1_set_chunk_state; 76 sd->sd_set_vol_state = sr_raid1_set_vol_state; 77 } 78 79 int 80 sr_raid1_alloc_resources(struct sr_discipline *sd) 81 { 82 int rv = EINVAL; 83 84 if (!sd) 85 return (rv); 86 87 DNPRINTF(SR_D_DIS, "%s: sr_raid1_alloc_resources\n", 88 DEVNAME(sd->sd_sc)); 89 90 if (sr_wu_alloc(sd)) 91 goto bad; 92 if (sr_ccb_alloc(sd)) 93 goto bad; 94 95 rv = 0; 96 bad: 97 return (rv); 98 } 99 100 int 101 sr_raid1_free_resources(struct sr_discipline *sd) 102 { 103 int rv = EINVAL; 104 105 if (!sd) 106 return (rv); 107 108 DNPRINTF(SR_D_DIS, "%s: sr_raid1_free_resources\n", 109 DEVNAME(sd->sd_sc)); 110 111 sr_wu_free(sd); 112 sr_ccb_free(sd); 113 114 rv = 0; 115 return (rv); 116 } 117 118 void 119 sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 120 { 121 int old_state, s; 122 123 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 124 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 125 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 126 127 /* ok to go to splbio since this only happens in error path */ 128 s = splbio(); 129 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 130 131 /* multiple IOs to the same chunk that fail will come through here */ 132 if (old_state == new_state) 133 goto done; 134 135 switch (old_state) { 136 case BIOC_SDONLINE: 137 switch (new_state) { 138 case BIOC_SDOFFLINE: 139 case BIOC_SDSCRUB: 140 break; 141 default: 142 goto die; 143 } 144 break; 145 146 case BIOC_SDOFFLINE: 147 switch (new_state) { 148 case BIOC_SDREBUILD: 149 case BIOC_SDHOTSPARE: 150 break; 151 default: 152 goto die; 153 } 154 break; 155 156 case BIOC_SDSCRUB: 157 if (new_state == BIOC_SDONLINE) { 158 ; 159 } else 160 goto die; 161 break; 162 163 case BIOC_SDREBUILD: 164 switch (new_state) { 165 case BIOC_SDONLINE: 166 break; 167 case BIOC_SDOFFLINE: 168 /* Abort rebuild since the rebuild chunk disappeared. */ 169 sd->sd_reb_abort = 1; 170 break; 171 default: 172 goto die; 173 } 174 break; 175 176 case BIOC_SDHOTSPARE: 177 switch (new_state) { 178 case BIOC_SDOFFLINE: 179 case BIOC_SDREBUILD: 180 break; 181 default: 182 goto die; 183 } 184 break; 185 186 default: 187 die: 188 splx(s); /* XXX */ 189 panic("%s: %s: %s: invalid chunk state transition " 190 "%d -> %d\n", DEVNAME(sd->sd_sc), 191 sd->sd_meta->ssd_devname, 192 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 193 old_state, new_state); 194 /* NOTREACHED */ 195 } 196 197 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 198 sd->sd_set_vol_state(sd); 199 200 sd->sd_must_flush = 1; 201 workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); 202 done: 203 splx(s); 204 } 205 206 void 207 sr_raid1_set_vol_state(struct sr_discipline *sd) 208 { 209 int states[SR_MAX_STATES]; 210 int new_state, i, s, nd; 211 int old_state = sd->sd_vol_status; 212 213 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 214 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 215 216 nd = sd->sd_meta->ssdi.ssd_chunk_no; 217 218 for (i = 0; i < SR_MAX_STATES; i++) 219 states[i] = 0; 220 221 for (i = 0; i < nd; i++) { 222 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 223 if (s >= SR_MAX_STATES) 224 panic("%s: %s: %s: invalid chunk state", 225 DEVNAME(sd->sd_sc), 226 sd->sd_meta->ssd_devname, 227 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 228 states[s]++; 229 } 230 231 if (states[BIOC_SDONLINE] == nd) 232 new_state = BIOC_SVONLINE; 233 else if (states[BIOC_SDONLINE] == 0) 234 new_state = BIOC_SVOFFLINE; 235 else if (states[BIOC_SDSCRUB] != 0) 236 new_state = BIOC_SVSCRUB; 237 else if (states[BIOC_SDREBUILD] != 0) 238 new_state = BIOC_SVREBUILD; 239 else if (states[BIOC_SDOFFLINE] != 0) 240 new_state = BIOC_SVDEGRADED; 241 else { 242 #ifdef SR_DEBUG 243 DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state " 244 "was %d\n", DEVNAME(sd->sd_sc), old_state); 245 for (i = 0; i < nd; i++) 246 DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n", 247 DEVNAME(sd->sd_sc), i, 248 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 249 #endif 250 panic("invalid volume state"); 251 } 252 253 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid1_set_vol_state %d -> %d\n", 254 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 255 old_state, new_state); 256 257 switch (old_state) { 258 case BIOC_SVONLINE: 259 switch (new_state) { 260 case BIOC_SVONLINE: /* can go to same state */ 261 case BIOC_SVOFFLINE: 262 case BIOC_SVDEGRADED: 263 case BIOC_SVREBUILD: /* happens on boot */ 264 break; 265 default: 266 goto die; 267 } 268 break; 269 270 case BIOC_SVOFFLINE: 271 /* XXX this might be a little too much */ 272 goto die; 273 274 case BIOC_SVSCRUB: 275 switch (new_state) { 276 case BIOC_SVONLINE: 277 case BIOC_SVOFFLINE: 278 case BIOC_SVDEGRADED: 279 case BIOC_SVSCRUB: /* can go to same state */ 280 break; 281 default: 282 goto die; 283 } 284 break; 285 286 case BIOC_SVBUILDING: 287 switch (new_state) { 288 case BIOC_SVONLINE: 289 case BIOC_SVOFFLINE: 290 case BIOC_SVBUILDING: /* can go to the same state */ 291 break; 292 default: 293 goto die; 294 } 295 break; 296 297 case BIOC_SVREBUILD: 298 switch (new_state) { 299 case BIOC_SVONLINE: 300 case BIOC_SVOFFLINE: 301 case BIOC_SVDEGRADED: 302 case BIOC_SVREBUILD: /* can go to the same state */ 303 break; 304 default: 305 goto die; 306 } 307 break; 308 309 case BIOC_SVDEGRADED: 310 switch (new_state) { 311 case BIOC_SVOFFLINE: 312 case BIOC_SVREBUILD: 313 case BIOC_SVDEGRADED: /* can go to the same state */ 314 break; 315 default: 316 goto die; 317 } 318 break; 319 320 default: 321 die: 322 panic("%s: %s: invalid volume state transition " 323 "%d -> %d\n", DEVNAME(sd->sd_sc), 324 sd->sd_meta->ssd_devname, 325 old_state, new_state); 326 /* NOTREACHED */ 327 } 328 329 sd->sd_vol_status = new_state; 330 331 /* If we have just become degraded, look for a hotspare. */ 332 if (new_state == BIOC_SVDEGRADED) 333 workq_add_task(NULL, 0, sr_hotspare_rebuild_callback, sd, NULL); 334 } 335 336 int 337 sr_raid1_rw(struct sr_workunit *wu) 338 { 339 struct sr_discipline *sd = wu->swu_dis; 340 struct scsi_xfer *xs = wu->swu_xs; 341 struct sr_ccb *ccb; 342 struct buf *b; 343 struct sr_chunk *scp; 344 int ios, x, i, s, rt; 345 daddr64_t blk; 346 347 /* blk and scsi error will be handled by sr_validate_io */ 348 if (sr_validate_io(wu, &blk, "sr_raid1_rw")) 349 goto bad; 350 351 /* calculate physical block */ 352 blk += SR_META_SIZE + SR_META_OFFSET; 353 354 if (xs->flags & SCSI_DATA_IN) 355 ios = 1; 356 else 357 ios = sd->sd_meta->ssdi.ssd_chunk_no; 358 wu->swu_io_count = ios; 359 360 for (i = 0; i < ios; i++) { 361 ccb = sr_ccb_get(sd); 362 if (!ccb) { 363 /* should never happen but handle more gracefully */ 364 printf("%s: %s: too many ccbs queued\n", 365 DEVNAME(sd->sd_sc), 366 sd->sd_meta->ssd_devname); 367 goto bad; 368 } 369 b = &ccb->ccb_buf; 370 371 if (xs->flags & SCSI_POLL) { 372 b->b_flags = 0; 373 b->b_iodone = NULL; 374 } else { 375 b->b_flags = B_CALL; 376 b->b_iodone = sr_raid1_intr; 377 } 378 379 b->b_flags |= B_PHYS; 380 b->b_blkno = blk; 381 b->b_bcount = xs->datalen; 382 b->b_bufsize = xs->datalen; 383 b->b_resid = xs->datalen; 384 b->b_data = xs->data; 385 b->b_error = 0; 386 b->b_proc = curproc; 387 ccb->ccb_wu = wu; 388 389 if (xs->flags & SCSI_DATA_IN) { 390 rt = 0; 391 ragain: 392 /* interleave reads */ 393 x = sd->mds.mdd_raid1.sr1_counter++ % 394 sd->sd_meta->ssdi.ssd_chunk_no; 395 scp = sd->sd_vol.sv_chunks[x]; 396 switch (scp->src_meta.scm_status) { 397 case BIOC_SDONLINE: 398 case BIOC_SDSCRUB: 399 b->b_flags |= B_READ; 400 break; 401 402 case BIOC_SDOFFLINE: 403 case BIOC_SDREBUILD: 404 case BIOC_SDHOTSPARE: 405 if (rt++ < sd->sd_meta->ssdi.ssd_chunk_no) 406 goto ragain; 407 408 /* FALLTHROUGH */ 409 default: 410 /* volume offline */ 411 printf("%s: is offline, can't read\n", 412 DEVNAME(sd->sd_sc)); 413 sr_ccb_put(ccb); 414 goto bad; 415 } 416 } else { 417 /* writes go on all working disks */ 418 x = i; 419 scp = sd->sd_vol.sv_chunks[x]; 420 switch (scp->src_meta.scm_status) { 421 case BIOC_SDONLINE: 422 case BIOC_SDSCRUB: 423 case BIOC_SDREBUILD: 424 b->b_flags |= B_WRITE; 425 break; 426 427 case BIOC_SDHOTSPARE: /* should never happen */ 428 case BIOC_SDOFFLINE: 429 wu->swu_io_count--; 430 sr_ccb_put(ccb); 431 continue; 432 433 default: 434 goto bad; 435 } 436 437 } 438 ccb->ccb_target = x; 439 b->b_dev = sd->sd_vol.sv_chunks[x]->src_dev_mm; 440 b->b_vp = sd->sd_vol.sv_chunks[x]->src_vn; 441 if ((b->b_flags & B_READ) == 0) 442 b->b_vp->v_numoutput++; 443 444 LIST_INIT(&b->b_dep); 445 446 TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); 447 448 DNPRINTF(SR_D_DIS, "%s: %s: sr_raid1: b_bcount: %d " 449 "b_blkno: %x b_flags 0x%0x b_data %p\n", 450 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 451 b->b_bcount, b->b_blkno, 452 b->b_flags, b->b_data); 453 } 454 455 s = splbio(); 456 457 /* rebuild io, let rebuild routine deal with it */ 458 if (wu->swu_flags & SR_WUF_REBUILD) 459 goto queued; 460 461 /* current io failed, restart */ 462 if (wu->swu_state == SR_WU_RESTART) 463 goto start; 464 465 /* deferred io failed, don't restart */ 466 if (wu->swu_state == SR_WU_REQUEUE) 467 goto queued; 468 469 if (sr_check_io_collision(wu)) 470 goto queued; 471 472 start: 473 sr_raid_startwu(wu); 474 queued: 475 splx(s); 476 return (0); 477 bad: 478 /* wu is unwound by sr_wu_put */ 479 return (1); 480 } 481 482 void 483 sr_raid1_intr(struct buf *bp) 484 { 485 struct sr_ccb *ccb = (struct sr_ccb *)bp; 486 struct sr_workunit *wu = ccb->ccb_wu, *wup; 487 struct sr_discipline *sd = wu->swu_dis; 488 struct scsi_xfer *xs = wu->swu_xs; 489 struct sr_softc *sc = sd->sd_sc; 490 struct buf *b; 491 int s, pend; 492 493 DNPRINTF(SR_D_INTR, "%s: sr_intr bp %x xs %x\n", 494 DEVNAME(sc), bp, xs); 495 496 b = &ccb->ccb_buf; 497 DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" 498 " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), 499 b->b_bcount, b->b_resid, b->b_flags, b->b_blkno, ccb->ccb_target); 500 501 s = splbio(); 502 503 if (b->b_flags & B_ERROR) { 504 DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", 505 DEVNAME(sc), b->b_blkno, ccb->ccb_target); 506 wu->swu_ios_failed++; 507 ccb->ccb_state = SR_CCB_FAILED; 508 if (ccb->ccb_target != -1) 509 sd->sd_set_chunk_state(sd, ccb->ccb_target, 510 BIOC_SDOFFLINE); 511 else 512 panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); 513 } else { 514 ccb->ccb_state = SR_CCB_OK; 515 wu->swu_ios_succeeded++; 516 } 517 wu->swu_ios_complete++; 518 519 DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", 520 DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, 521 wu->swu_ios_failed); 522 523 if (wu->swu_ios_complete >= wu->swu_io_count) { 524 /* if all ios failed, retry reads and give up on writes */ 525 if (wu->swu_ios_failed == wu->swu_ios_complete) { 526 if (xs->flags & SCSI_DATA_IN) { 527 printf("%s: retrying read on block %lld\n", 528 DEVNAME(sc), b->b_blkno); 529 sr_ccb_put(ccb); 530 TAILQ_INIT(&wu->swu_ccb); 531 wu->swu_state = SR_WU_RESTART; 532 if (sd->sd_scsi_rw(wu)) 533 goto bad; 534 else 535 goto retry; 536 } else { 537 printf("%s: permanently fail write on block " 538 "%lld\n", DEVNAME(sc), b->b_blkno); 539 xs->error = XS_DRIVER_STUFFUP; 540 goto bad; 541 } 542 } 543 544 xs->error = XS_NOERROR; 545 xs->resid = 0; 546 xs->flags |= ITSDONE; 547 548 pend = 0; 549 TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { 550 if (wu == wup) { 551 /* wu on pendq, remove */ 552 TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); 553 pend = 1; 554 555 if (wu->swu_collider) { 556 if (wu->swu_ios_failed) 557 /* toss all ccbs and recreate */ 558 sr_raid1_recreate_wu(wu->swu_collider); 559 560 /* restart deferred wu */ 561 wu->swu_collider->swu_state = 562 SR_WU_INPROGRESS; 563 TAILQ_REMOVE(&sd->sd_wu_defq, 564 wu->swu_collider, swu_link); 565 sr_raid_startwu(wu->swu_collider); 566 } 567 break; 568 } 569 } 570 571 if (!pend) 572 printf("%s: wu: %p not on pending queue\n", 573 DEVNAME(sc), wu); 574 575 if (wu->swu_flags & SR_WUF_REBUILD) { 576 if (wu->swu_xs->flags & SCSI_DATA_OUT) { 577 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 578 wakeup(wu); 579 } 580 } else { 581 /* do not change the order of these 2 functions */ 582 sr_wu_put(wu); 583 scsi_done(xs); 584 } 585 586 if (sd->sd_sync && sd->sd_wu_pending == 0) 587 wakeup(sd); 588 } 589 590 retry: 591 splx(s); 592 return; 593 bad: 594 xs->error = XS_DRIVER_STUFFUP; 595 xs->flags |= ITSDONE; 596 if (wu->swu_flags & SR_WUF_REBUILD) { 597 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 598 wakeup(wu); 599 } else { 600 /* do not change the order of these 2 functions */ 601 sr_wu_put(wu); 602 scsi_done(xs); 603 } 604 605 splx(s); 606 } 607 608 void 609 sr_raid1_recreate_wu(struct sr_workunit *wu) 610 { 611 struct sr_discipline *sd = wu->swu_dis; 612 struct sr_workunit *wup = wu; 613 struct sr_ccb *ccb; 614 615 do { 616 DNPRINTF(SR_D_INTR, "%s: sr_raid1_recreate_wu: %p\n", wup); 617 618 /* toss all ccbs */ 619 while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { 620 TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); 621 sr_ccb_put(ccb); 622 } 623 TAILQ_INIT(&wup->swu_ccb); 624 625 /* recreate ccbs */ 626 wup->swu_state = SR_WU_REQUEUE; 627 if (sd->sd_scsi_rw(wup)) 628 panic("could not requeue io"); 629 630 wup = wup->swu_collider; 631 } while (wup); 632 } 633