xref: /openbsd-src/sys/dev/softraid_raid1.c (revision 43003dfe3ad45d1698bed8a37f2b0f5b14f20d4f)
1 /* $OpenBSD: softraid_raid1.c,v 1.19 2009/08/09 14:12:25 marco Exp $ */
2 /*
3  * Copyright (c) 2007 Marco Peereboom <marco@peereboom.us>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include "bio.h"
19 
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <sys/buf.h>
23 #include <sys/device.h>
24 #include <sys/ioctl.h>
25 #include <sys/proc.h>
26 #include <sys/malloc.h>
27 #include <sys/kernel.h>
28 #include <sys/disk.h>
29 #include <sys/rwlock.h>
30 #include <sys/queue.h>
31 #include <sys/fcntl.h>
32 #include <sys/disklabel.h>
33 #include <sys/mount.h>
34 #include <sys/sensors.h>
35 #include <sys/stat.h>
36 #include <sys/conf.h>
37 #include <sys/uio.h>
38 
39 #include <scsi/scsi_all.h>
40 #include <scsi/scsiconf.h>
41 #include <scsi/scsi_disk.h>
42 
43 #include <dev/softraidvar.h>
44 #include <dev/rndvar.h>
45 
46 /* RAID 1 functions. */
47 int	sr_raid1_alloc_resources(struct sr_discipline *);
48 int	sr_raid1_free_resources(struct sr_discipline *);
49 int	sr_raid1_rw(struct sr_workunit *);
50 void	sr_raid1_intr(struct buf *);
51 void	sr_raid1_recreate_wu(struct sr_workunit *);
52 
53 /* Discipline initialisation. */
54 void
55 sr_raid1_discipline_init(struct sr_discipline *sd)
56 {
57 
58 	/* Fill out discipline members. */
59 	sd->sd_type = SR_MD_RAID1;
60 	sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
61 	sd->sd_max_wu = SR_RAID1_NOWU;
62 	sd->sd_rebuild = 1;
63 
64 	/* Setup discipline pointers. */
65 	sd->sd_alloc_resources = sr_raid1_alloc_resources;
66 	sd->sd_free_resources = sr_raid1_free_resources;
67 	sd->sd_start_discipline = NULL;
68 	sd->sd_scsi_inquiry = sr_raid_inquiry;
69 	sd->sd_scsi_read_cap = sr_raid_read_cap;
70 	sd->sd_scsi_tur = sr_raid_tur;
71 	sd->sd_scsi_req_sense = sr_raid_request_sense;
72 	sd->sd_scsi_start_stop = sr_raid_start_stop;
73 	sd->sd_scsi_sync = sr_raid_sync;
74 	sd->sd_scsi_rw = sr_raid1_rw;
75 	sd->sd_set_chunk_state = sr_raid1_set_chunk_state;
76 	sd->sd_set_vol_state = sr_raid1_set_vol_state;
77 }
78 
79 int
80 sr_raid1_alloc_resources(struct sr_discipline *sd)
81 {
82 	int			rv = EINVAL;
83 
84 	if (!sd)
85 		return (rv);
86 
87 	DNPRINTF(SR_D_DIS, "%s: sr_raid1_alloc_resources\n",
88 	    DEVNAME(sd->sd_sc));
89 
90 	if (sr_wu_alloc(sd))
91 		goto bad;
92 	if (sr_ccb_alloc(sd))
93 		goto bad;
94 
95 	rv = 0;
96 bad:
97 	return (rv);
98 }
99 
100 int
101 sr_raid1_free_resources(struct sr_discipline *sd)
102 {
103 	int			rv = EINVAL;
104 
105 	if (!sd)
106 		return (rv);
107 
108 	DNPRINTF(SR_D_DIS, "%s: sr_raid1_free_resources\n",
109 	    DEVNAME(sd->sd_sc));
110 
111 	sr_wu_free(sd);
112 	sr_ccb_free(sd);
113 
114 	rv = 0;
115 	return (rv);
116 }
117 
118 void
119 sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
120 {
121 	int			old_state, s;
122 
123 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
124 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
125 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
126 
127 	/* ok to go to splbio since this only happens in error path */
128 	s = splbio();
129 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
130 
131 	/* multiple IOs to the same chunk that fail will come through here */
132 	if (old_state == new_state)
133 		goto done;
134 
135 	switch (old_state) {
136 	case BIOC_SDONLINE:
137 		switch (new_state) {
138 		case BIOC_SDOFFLINE:
139 		case BIOC_SDSCRUB:
140 			break;
141 		default:
142 			goto die;
143 		}
144 		break;
145 
146 	case BIOC_SDOFFLINE:
147 		switch (new_state) {
148 		case BIOC_SDREBUILD:
149 		case BIOC_SDHOTSPARE:
150 			break;
151 		default:
152 			goto die;
153 		}
154 		break;
155 
156 	case BIOC_SDSCRUB:
157 		if (new_state == BIOC_SDONLINE) {
158 			;
159 		} else
160 			goto die;
161 		break;
162 
163 	case BIOC_SDREBUILD:
164 		switch (new_state) {
165 		case BIOC_SDONLINE:
166 			break;
167 		case BIOC_SDOFFLINE:
168 			/* Abort rebuild since the rebuild chunk disappeared. */
169 			sd->sd_reb_abort = 1;
170 			break;
171 		default:
172 			goto die;
173 		}
174 		break;
175 
176 	case BIOC_SDHOTSPARE:
177 		switch (new_state) {
178 		case BIOC_SDOFFLINE:
179 		case BIOC_SDREBUILD:
180 			break;
181 		default:
182 			goto die;
183 		}
184 		break;
185 
186 	default:
187 die:
188 		splx(s); /* XXX */
189 		panic("%s: %s: %s: invalid chunk state transition "
190 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
191 		    sd->sd_meta->ssd_devname,
192 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
193 		    old_state, new_state);
194 		/* NOTREACHED */
195 	}
196 
197 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
198 	sd->sd_set_vol_state(sd);
199 
200 	sd->sd_must_flush = 1;
201 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
202 done:
203 	splx(s);
204 }
205 
206 void
207 sr_raid1_set_vol_state(struct sr_discipline *sd)
208 {
209 	int			states[SR_MAX_STATES];
210 	int			new_state, i, s, nd;
211 	int			old_state = sd->sd_vol_status;
212 
213 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
214 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
215 
216 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
217 
218 	for (i = 0; i < SR_MAX_STATES; i++)
219 		states[i] = 0;
220 
221 	for (i = 0; i < nd; i++) {
222 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
223 		if (s >= SR_MAX_STATES)
224 			panic("%s: %s: %s: invalid chunk state",
225 			    DEVNAME(sd->sd_sc),
226 			    sd->sd_meta->ssd_devname,
227 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
228 		states[s]++;
229 	}
230 
231 	if (states[BIOC_SDONLINE] == nd)
232 		new_state = BIOC_SVONLINE;
233 	else if (states[BIOC_SDONLINE] == 0)
234 		new_state = BIOC_SVOFFLINE;
235 	else if (states[BIOC_SDSCRUB] != 0)
236 		new_state = BIOC_SVSCRUB;
237 	else if (states[BIOC_SDREBUILD] != 0)
238 		new_state = BIOC_SVREBUILD;
239 	else if (states[BIOC_SDOFFLINE] != 0)
240 		new_state = BIOC_SVDEGRADED;
241 	else {
242 #ifdef SR_DEBUG
243 		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
244 		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
245 		for (i = 0; i < nd; i++)
246 			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
247 			    DEVNAME(sd->sd_sc), i,
248 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
249 #endif
250 		panic("invalid volume state");
251 	}
252 
253 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid1_set_vol_state %d -> %d\n",
254 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
255 	    old_state, new_state);
256 
257 	switch (old_state) {
258 	case BIOC_SVONLINE:
259 		switch (new_state) {
260 		case BIOC_SVONLINE: /* can go to same state */
261 		case BIOC_SVOFFLINE:
262 		case BIOC_SVDEGRADED:
263 		case BIOC_SVREBUILD: /* happens on boot */
264 			break;
265 		default:
266 			goto die;
267 		}
268 		break;
269 
270 	case BIOC_SVOFFLINE:
271 		/* XXX this might be a little too much */
272 		goto die;
273 
274 	case BIOC_SVSCRUB:
275 		switch (new_state) {
276 		case BIOC_SVONLINE:
277 		case BIOC_SVOFFLINE:
278 		case BIOC_SVDEGRADED:
279 		case BIOC_SVSCRUB: /* can go to same state */
280 			break;
281 		default:
282 			goto die;
283 		}
284 		break;
285 
286 	case BIOC_SVBUILDING:
287 		switch (new_state) {
288 		case BIOC_SVONLINE:
289 		case BIOC_SVOFFLINE:
290 		case BIOC_SVBUILDING: /* can go to the same state */
291 			break;
292 		default:
293 			goto die;
294 		}
295 		break;
296 
297 	case BIOC_SVREBUILD:
298 		switch (new_state) {
299 		case BIOC_SVONLINE:
300 		case BIOC_SVOFFLINE:
301 		case BIOC_SVDEGRADED:
302 		case BIOC_SVREBUILD: /* can go to the same state */
303 			break;
304 		default:
305 			goto die;
306 		}
307 		break;
308 
309 	case BIOC_SVDEGRADED:
310 		switch (new_state) {
311 		case BIOC_SVOFFLINE:
312 		case BIOC_SVREBUILD:
313 		case BIOC_SVDEGRADED: /* can go to the same state */
314 			break;
315 		default:
316 			goto die;
317 		}
318 		break;
319 
320 	default:
321 die:
322 		panic("%s: %s: invalid volume state transition "
323 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
324 		    sd->sd_meta->ssd_devname,
325 		    old_state, new_state);
326 		/* NOTREACHED */
327 	}
328 
329 	sd->sd_vol_status = new_state;
330 
331 	/* If we have just become degraded, look for a hotspare. */
332 	if (new_state == BIOC_SVDEGRADED)
333 		workq_add_task(NULL, 0, sr_hotspare_rebuild_callback, sd, NULL);
334 }
335 
336 int
337 sr_raid1_rw(struct sr_workunit *wu)
338 {
339 	struct sr_discipline	*sd = wu->swu_dis;
340 	struct scsi_xfer	*xs = wu->swu_xs;
341 	struct sr_ccb		*ccb;
342 	struct buf		*b;
343 	struct sr_chunk		*scp;
344 	int			ios, x, i, s, rt;
345 	daddr64_t		blk;
346 
347 	/* blk and scsi error will be handled by sr_validate_io */
348 	if (sr_validate_io(wu, &blk, "sr_raid1_rw"))
349 		goto bad;
350 
351 	/* calculate physical block */
352 	blk += SR_META_SIZE + SR_META_OFFSET;
353 
354 	if (xs->flags & SCSI_DATA_IN)
355 		ios = 1;
356 	else
357 		ios = sd->sd_meta->ssdi.ssd_chunk_no;
358 	wu->swu_io_count = ios;
359 
360 	for (i = 0; i < ios; i++) {
361 		ccb = sr_ccb_get(sd);
362 		if (!ccb) {
363 			/* should never happen but handle more gracefully */
364 			printf("%s: %s: too many ccbs queued\n",
365 			    DEVNAME(sd->sd_sc),
366 			    sd->sd_meta->ssd_devname);
367 			goto bad;
368 		}
369 		b = &ccb->ccb_buf;
370 
371 		if (xs->flags & SCSI_POLL) {
372 			b->b_flags = 0;
373 			b->b_iodone = NULL;
374 		} else {
375 			b->b_flags = B_CALL;
376 			b->b_iodone = sr_raid1_intr;
377 		}
378 
379 		b->b_flags |= B_PHYS;
380 		b->b_blkno = blk;
381 		b->b_bcount = xs->datalen;
382 		b->b_bufsize = xs->datalen;
383 		b->b_resid = xs->datalen;
384 		b->b_data = xs->data;
385 		b->b_error = 0;
386 		b->b_proc = curproc;
387 		ccb->ccb_wu = wu;
388 
389 		if (xs->flags & SCSI_DATA_IN) {
390 			rt = 0;
391 ragain:
392 			/* interleave reads */
393 			x = sd->mds.mdd_raid1.sr1_counter++ %
394 			    sd->sd_meta->ssdi.ssd_chunk_no;
395 			scp = sd->sd_vol.sv_chunks[x];
396 			switch (scp->src_meta.scm_status) {
397 			case BIOC_SDONLINE:
398 			case BIOC_SDSCRUB:
399 				b->b_flags |= B_READ;
400 				break;
401 
402 			case BIOC_SDOFFLINE:
403 			case BIOC_SDREBUILD:
404 			case BIOC_SDHOTSPARE:
405 				if (rt++ < sd->sd_meta->ssdi.ssd_chunk_no)
406 					goto ragain;
407 
408 				/* FALLTHROUGH */
409 			default:
410 				/* volume offline */
411 				printf("%s: is offline, can't read\n",
412 				    DEVNAME(sd->sd_sc));
413 				sr_ccb_put(ccb);
414 				goto bad;
415 			}
416 		} else {
417 			/* writes go on all working disks */
418 			x = i;
419 			scp = sd->sd_vol.sv_chunks[x];
420 			switch (scp->src_meta.scm_status) {
421 			case BIOC_SDONLINE:
422 			case BIOC_SDSCRUB:
423 			case BIOC_SDREBUILD:
424 				b->b_flags |= B_WRITE;
425 				break;
426 
427 			case BIOC_SDHOTSPARE: /* should never happen */
428 			case BIOC_SDOFFLINE:
429 				wu->swu_io_count--;
430 				sr_ccb_put(ccb);
431 				continue;
432 
433 			default:
434 				goto bad;
435 			}
436 
437 		}
438 		ccb->ccb_target = x;
439 		b->b_dev = sd->sd_vol.sv_chunks[x]->src_dev_mm;
440 		b->b_vp = sd->sd_vol.sv_chunks[x]->src_vn;
441 		if ((b->b_flags & B_READ) == 0)
442 			b->b_vp->v_numoutput++;
443 
444 		LIST_INIT(&b->b_dep);
445 
446 		TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
447 
448 		DNPRINTF(SR_D_DIS, "%s: %s: sr_raid1: b_bcount: %d "
449 		    "b_blkno: %x b_flags 0x%0x b_data %p\n",
450 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
451 		    b->b_bcount, b->b_blkno,
452 		    b->b_flags, b->b_data);
453 	}
454 
455 	s = splbio();
456 
457 	/* rebuild io, let rebuild routine deal with it */
458 	if (wu->swu_flags & SR_WUF_REBUILD)
459 		goto queued;
460 
461 	/* current io failed, restart */
462 	if (wu->swu_state == SR_WU_RESTART)
463 		goto start;
464 
465 	/* deferred io failed, don't restart */
466 	if (wu->swu_state == SR_WU_REQUEUE)
467 		goto queued;
468 
469 	if (sr_check_io_collision(wu))
470 		goto queued;
471 
472 start:
473 	sr_raid_startwu(wu);
474 queued:
475 	splx(s);
476 	return (0);
477 bad:
478 	/* wu is unwound by sr_wu_put */
479 	return (1);
480 }
481 
482 void
483 sr_raid1_intr(struct buf *bp)
484 {
485 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
486 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
487 	struct sr_discipline	*sd = wu->swu_dis;
488 	struct scsi_xfer	*xs = wu->swu_xs;
489 	struct sr_softc		*sc = sd->sd_sc;
490 	struct buf		*b;
491 	int			s, pend;
492 
493 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %x xs %x\n",
494 	    DEVNAME(sc), bp, xs);
495 
496 	b = &ccb->ccb_buf;
497 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
498 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
499 	    b->b_bcount, b->b_resid, b->b_flags, b->b_blkno, ccb->ccb_target);
500 
501 	s = splbio();
502 
503 	if (b->b_flags & B_ERROR) {
504 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
505 		    DEVNAME(sc), b->b_blkno, ccb->ccb_target);
506 		wu->swu_ios_failed++;
507 		ccb->ccb_state = SR_CCB_FAILED;
508 		if (ccb->ccb_target != -1)
509 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
510 			    BIOC_SDOFFLINE);
511 		else
512 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
513 	} else {
514 		ccb->ccb_state = SR_CCB_OK;
515 		wu->swu_ios_succeeded++;
516 	}
517 	wu->swu_ios_complete++;
518 
519 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
520 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
521 	    wu->swu_ios_failed);
522 
523 	if (wu->swu_ios_complete >= wu->swu_io_count) {
524 		/* if all ios failed, retry reads and give up on writes */
525 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
526 			if (xs->flags & SCSI_DATA_IN) {
527 				printf("%s: retrying read on block %lld\n",
528 				    DEVNAME(sc), b->b_blkno);
529 				sr_ccb_put(ccb);
530 				TAILQ_INIT(&wu->swu_ccb);
531 				wu->swu_state = SR_WU_RESTART;
532 				if (sd->sd_scsi_rw(wu))
533 					goto bad;
534 				else
535 					goto retry;
536 			} else {
537 				printf("%s: permanently fail write on block "
538 				    "%lld\n", DEVNAME(sc), b->b_blkno);
539 				xs->error = XS_DRIVER_STUFFUP;
540 				goto bad;
541 			}
542 		}
543 
544 		xs->error = XS_NOERROR;
545 		xs->resid = 0;
546 		xs->flags |= ITSDONE;
547 
548 		pend = 0;
549 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
550 			if (wu == wup) {
551 				/* wu on pendq, remove */
552 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
553 				pend = 1;
554 
555 				if (wu->swu_collider) {
556 					if (wu->swu_ios_failed)
557 						/* toss all ccbs and recreate */
558 						sr_raid1_recreate_wu(wu->swu_collider);
559 
560 					/* restart deferred wu */
561 					wu->swu_collider->swu_state =
562 					    SR_WU_INPROGRESS;
563 					TAILQ_REMOVE(&sd->sd_wu_defq,
564 					    wu->swu_collider, swu_link);
565 					sr_raid_startwu(wu->swu_collider);
566 				}
567 				break;
568 			}
569 		}
570 
571 		if (!pend)
572 			printf("%s: wu: %p not on pending queue\n",
573 			    DEVNAME(sc), wu);
574 
575 		if (wu->swu_flags & SR_WUF_REBUILD) {
576 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
577 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
578 				wakeup(wu);
579 			}
580 		} else {
581 			/* do not change the order of these 2 functions */
582 			sr_wu_put(wu);
583 			scsi_done(xs);
584 		}
585 
586 		if (sd->sd_sync && sd->sd_wu_pending == 0)
587 			wakeup(sd);
588 	}
589 
590 retry:
591 	splx(s);
592 	return;
593 bad:
594 	xs->error = XS_DRIVER_STUFFUP;
595 	xs->flags |= ITSDONE;
596 	if (wu->swu_flags & SR_WUF_REBUILD) {
597 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
598 		wakeup(wu);
599 	} else {
600 		/* do not change the order of these 2 functions */
601 		sr_wu_put(wu);
602 		scsi_done(xs);
603 	}
604 
605 	splx(s);
606 }
607 
608 void
609 sr_raid1_recreate_wu(struct sr_workunit *wu)
610 {
611 	struct sr_discipline	*sd = wu->swu_dis;
612 	struct sr_workunit	*wup = wu;
613 	struct sr_ccb		*ccb;
614 
615 	do {
616 		DNPRINTF(SR_D_INTR, "%s: sr_raid1_recreate_wu: %p\n", wup);
617 
618 		/* toss all ccbs */
619 		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
620 			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
621 			sr_ccb_put(ccb);
622 		}
623 		TAILQ_INIT(&wup->swu_ccb);
624 
625 		/* recreate ccbs */
626 		wup->swu_state = SR_WU_REQUEUE;
627 		if (sd->sd_scsi_rw(wup))
628 			panic("could not requeue io");
629 
630 		wup = wup->swu_collider;
631 	} while (wup);
632 }
633