xref: /openbsd-src/sys/dev/softraid_raid1.c (revision 50027fe110c3c362514cbbf1128910104a00203e)
1 /* $OpenBSD: softraid_raid1.c,v 1.20 2009/12/07 14:27:12 jsing Exp $ */
2 /*
3  * Copyright (c) 2007 Marco Peereboom <marco@peereboom.us>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include "bio.h"
19 
20 #include <sys/param.h>
21 #include <sys/systm.h>
22 #include <sys/buf.h>
23 #include <sys/device.h>
24 #include <sys/ioctl.h>
25 #include <sys/proc.h>
26 #include <sys/malloc.h>
27 #include <sys/kernel.h>
28 #include <sys/disk.h>
29 #include <sys/rwlock.h>
30 #include <sys/queue.h>
31 #include <sys/fcntl.h>
32 #include <sys/disklabel.h>
33 #include <sys/mount.h>
34 #include <sys/sensors.h>
35 #include <sys/stat.h>
36 #include <sys/conf.h>
37 #include <sys/uio.h>
38 
39 #include <scsi/scsi_all.h>
40 #include <scsi/scsiconf.h>
41 #include <scsi/scsi_disk.h>
42 
43 #include <dev/softraidvar.h>
44 #include <dev/rndvar.h>
45 
46 /* RAID 1 functions. */
47 int	sr_raid1_alloc_resources(struct sr_discipline *);
48 int	sr_raid1_free_resources(struct sr_discipline *);
49 int	sr_raid1_rw(struct sr_workunit *);
50 void	sr_raid1_intr(struct buf *);
51 void	sr_raid1_recreate_wu(struct sr_workunit *);
52 
53 /* Discipline initialisation. */
54 void
55 sr_raid1_discipline_init(struct sr_discipline *sd)
56 {
57 
58 	/* Fill out discipline members. */
59 	sd->sd_type = SR_MD_RAID1;
60 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
61 	    SR_CAP_REBUILD;
62 	sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
63 	sd->sd_max_wu = SR_RAID1_NOWU;
64 
65 	/* Setup discipline pointers. */
66 	sd->sd_alloc_resources = sr_raid1_alloc_resources;
67 	sd->sd_free_resources = sr_raid1_free_resources;
68 	sd->sd_start_discipline = NULL;
69 	sd->sd_scsi_inquiry = sr_raid_inquiry;
70 	sd->sd_scsi_read_cap = sr_raid_read_cap;
71 	sd->sd_scsi_tur = sr_raid_tur;
72 	sd->sd_scsi_req_sense = sr_raid_request_sense;
73 	sd->sd_scsi_start_stop = sr_raid_start_stop;
74 	sd->sd_scsi_sync = sr_raid_sync;
75 	sd->sd_scsi_rw = sr_raid1_rw;
76 	sd->sd_set_chunk_state = sr_raid1_set_chunk_state;
77 	sd->sd_set_vol_state = sr_raid1_set_vol_state;
78 }
79 
80 int
81 sr_raid1_alloc_resources(struct sr_discipline *sd)
82 {
83 	int			rv = EINVAL;
84 
85 	if (!sd)
86 		return (rv);
87 
88 	DNPRINTF(SR_D_DIS, "%s: sr_raid1_alloc_resources\n",
89 	    DEVNAME(sd->sd_sc));
90 
91 	if (sr_wu_alloc(sd))
92 		goto bad;
93 	if (sr_ccb_alloc(sd))
94 		goto bad;
95 
96 	rv = 0;
97 bad:
98 	return (rv);
99 }
100 
101 int
102 sr_raid1_free_resources(struct sr_discipline *sd)
103 {
104 	int			rv = EINVAL;
105 
106 	if (!sd)
107 		return (rv);
108 
109 	DNPRINTF(SR_D_DIS, "%s: sr_raid1_free_resources\n",
110 	    DEVNAME(sd->sd_sc));
111 
112 	sr_wu_free(sd);
113 	sr_ccb_free(sd);
114 
115 	rv = 0;
116 	return (rv);
117 }
118 
119 void
120 sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
121 {
122 	int			old_state, s;
123 
124 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
125 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
126 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
127 
128 	/* ok to go to splbio since this only happens in error path */
129 	s = splbio();
130 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
131 
132 	/* multiple IOs to the same chunk that fail will come through here */
133 	if (old_state == new_state)
134 		goto done;
135 
136 	switch (old_state) {
137 	case BIOC_SDONLINE:
138 		switch (new_state) {
139 		case BIOC_SDOFFLINE:
140 		case BIOC_SDSCRUB:
141 			break;
142 		default:
143 			goto die;
144 		}
145 		break;
146 
147 	case BIOC_SDOFFLINE:
148 		switch (new_state) {
149 		case BIOC_SDREBUILD:
150 		case BIOC_SDHOTSPARE:
151 			break;
152 		default:
153 			goto die;
154 		}
155 		break;
156 
157 	case BIOC_SDSCRUB:
158 		if (new_state == BIOC_SDONLINE) {
159 			;
160 		} else
161 			goto die;
162 		break;
163 
164 	case BIOC_SDREBUILD:
165 		switch (new_state) {
166 		case BIOC_SDONLINE:
167 			break;
168 		case BIOC_SDOFFLINE:
169 			/* Abort rebuild since the rebuild chunk disappeared. */
170 			sd->sd_reb_abort = 1;
171 			break;
172 		default:
173 			goto die;
174 		}
175 		break;
176 
177 	case BIOC_SDHOTSPARE:
178 		switch (new_state) {
179 		case BIOC_SDOFFLINE:
180 		case BIOC_SDREBUILD:
181 			break;
182 		default:
183 			goto die;
184 		}
185 		break;
186 
187 	default:
188 die:
189 		splx(s); /* XXX */
190 		panic("%s: %s: %s: invalid chunk state transition "
191 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
192 		    sd->sd_meta->ssd_devname,
193 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
194 		    old_state, new_state);
195 		/* NOTREACHED */
196 	}
197 
198 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
199 	sd->sd_set_vol_state(sd);
200 
201 	sd->sd_must_flush = 1;
202 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
203 done:
204 	splx(s);
205 }
206 
207 void
208 sr_raid1_set_vol_state(struct sr_discipline *sd)
209 {
210 	int			states[SR_MAX_STATES];
211 	int			new_state, i, s, nd;
212 	int			old_state = sd->sd_vol_status;
213 
214 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
215 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
216 
217 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
218 
219 	for (i = 0; i < SR_MAX_STATES; i++)
220 		states[i] = 0;
221 
222 	for (i = 0; i < nd; i++) {
223 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
224 		if (s >= SR_MAX_STATES)
225 			panic("%s: %s: %s: invalid chunk state",
226 			    DEVNAME(sd->sd_sc),
227 			    sd->sd_meta->ssd_devname,
228 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
229 		states[s]++;
230 	}
231 
232 	if (states[BIOC_SDONLINE] == nd)
233 		new_state = BIOC_SVONLINE;
234 	else if (states[BIOC_SDONLINE] == 0)
235 		new_state = BIOC_SVOFFLINE;
236 	else if (states[BIOC_SDSCRUB] != 0)
237 		new_state = BIOC_SVSCRUB;
238 	else if (states[BIOC_SDREBUILD] != 0)
239 		new_state = BIOC_SVREBUILD;
240 	else if (states[BIOC_SDOFFLINE] != 0)
241 		new_state = BIOC_SVDEGRADED;
242 	else {
243 #ifdef SR_DEBUG
244 		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
245 		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
246 		for (i = 0; i < nd; i++)
247 			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
248 			    DEVNAME(sd->sd_sc), i,
249 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
250 #endif
251 		panic("invalid volume state");
252 	}
253 
254 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid1_set_vol_state %d -> %d\n",
255 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
256 	    old_state, new_state);
257 
258 	switch (old_state) {
259 	case BIOC_SVONLINE:
260 		switch (new_state) {
261 		case BIOC_SVONLINE: /* can go to same state */
262 		case BIOC_SVOFFLINE:
263 		case BIOC_SVDEGRADED:
264 		case BIOC_SVREBUILD: /* happens on boot */
265 			break;
266 		default:
267 			goto die;
268 		}
269 		break;
270 
271 	case BIOC_SVOFFLINE:
272 		/* XXX this might be a little too much */
273 		goto die;
274 
275 	case BIOC_SVSCRUB:
276 		switch (new_state) {
277 		case BIOC_SVONLINE:
278 		case BIOC_SVOFFLINE:
279 		case BIOC_SVDEGRADED:
280 		case BIOC_SVSCRUB: /* can go to same state */
281 			break;
282 		default:
283 			goto die;
284 		}
285 		break;
286 
287 	case BIOC_SVBUILDING:
288 		switch (new_state) {
289 		case BIOC_SVONLINE:
290 		case BIOC_SVOFFLINE:
291 		case BIOC_SVBUILDING: /* can go to the same state */
292 			break;
293 		default:
294 			goto die;
295 		}
296 		break;
297 
298 	case BIOC_SVREBUILD:
299 		switch (new_state) {
300 		case BIOC_SVONLINE:
301 		case BIOC_SVOFFLINE:
302 		case BIOC_SVDEGRADED:
303 		case BIOC_SVREBUILD: /* can go to the same state */
304 			break;
305 		default:
306 			goto die;
307 		}
308 		break;
309 
310 	case BIOC_SVDEGRADED:
311 		switch (new_state) {
312 		case BIOC_SVOFFLINE:
313 		case BIOC_SVREBUILD:
314 		case BIOC_SVDEGRADED: /* can go to the same state */
315 			break;
316 		default:
317 			goto die;
318 		}
319 		break;
320 
321 	default:
322 die:
323 		panic("%s: %s: invalid volume state transition "
324 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
325 		    sd->sd_meta->ssd_devname,
326 		    old_state, new_state);
327 		/* NOTREACHED */
328 	}
329 
330 	sd->sd_vol_status = new_state;
331 
332 	/* If we have just become degraded, look for a hotspare. */
333 	if (new_state == BIOC_SVDEGRADED)
334 		workq_add_task(NULL, 0, sr_hotspare_rebuild_callback, sd, NULL);
335 }
336 
337 int
338 sr_raid1_rw(struct sr_workunit *wu)
339 {
340 	struct sr_discipline	*sd = wu->swu_dis;
341 	struct scsi_xfer	*xs = wu->swu_xs;
342 	struct sr_ccb		*ccb;
343 	struct buf		*b;
344 	struct sr_chunk		*scp;
345 	int			ios, x, i, s, rt;
346 	daddr64_t		blk;
347 
348 	/* blk and scsi error will be handled by sr_validate_io */
349 	if (sr_validate_io(wu, &blk, "sr_raid1_rw"))
350 		goto bad;
351 
352 	/* calculate physical block */
353 	blk += SR_META_SIZE + SR_META_OFFSET;
354 
355 	if (xs->flags & SCSI_DATA_IN)
356 		ios = 1;
357 	else
358 		ios = sd->sd_meta->ssdi.ssd_chunk_no;
359 	wu->swu_io_count = ios;
360 
361 	for (i = 0; i < ios; i++) {
362 		ccb = sr_ccb_get(sd);
363 		if (!ccb) {
364 			/* should never happen but handle more gracefully */
365 			printf("%s: %s: too many ccbs queued\n",
366 			    DEVNAME(sd->sd_sc),
367 			    sd->sd_meta->ssd_devname);
368 			goto bad;
369 		}
370 		b = &ccb->ccb_buf;
371 
372 		if (xs->flags & SCSI_POLL) {
373 			b->b_flags = 0;
374 			b->b_iodone = NULL;
375 		} else {
376 			b->b_flags = B_CALL;
377 			b->b_iodone = sr_raid1_intr;
378 		}
379 
380 		b->b_flags |= B_PHYS;
381 		b->b_blkno = blk;
382 		b->b_bcount = xs->datalen;
383 		b->b_bufsize = xs->datalen;
384 		b->b_resid = xs->datalen;
385 		b->b_data = xs->data;
386 		b->b_error = 0;
387 		b->b_proc = curproc;
388 		ccb->ccb_wu = wu;
389 
390 		if (xs->flags & SCSI_DATA_IN) {
391 			rt = 0;
392 ragain:
393 			/* interleave reads */
394 			x = sd->mds.mdd_raid1.sr1_counter++ %
395 			    sd->sd_meta->ssdi.ssd_chunk_no;
396 			scp = sd->sd_vol.sv_chunks[x];
397 			switch (scp->src_meta.scm_status) {
398 			case BIOC_SDONLINE:
399 			case BIOC_SDSCRUB:
400 				b->b_flags |= B_READ;
401 				break;
402 
403 			case BIOC_SDOFFLINE:
404 			case BIOC_SDREBUILD:
405 			case BIOC_SDHOTSPARE:
406 				if (rt++ < sd->sd_meta->ssdi.ssd_chunk_no)
407 					goto ragain;
408 
409 				/* FALLTHROUGH */
410 			default:
411 				/* volume offline */
412 				printf("%s: is offline, can't read\n",
413 				    DEVNAME(sd->sd_sc));
414 				sr_ccb_put(ccb);
415 				goto bad;
416 			}
417 		} else {
418 			/* writes go on all working disks */
419 			x = i;
420 			scp = sd->sd_vol.sv_chunks[x];
421 			switch (scp->src_meta.scm_status) {
422 			case BIOC_SDONLINE:
423 			case BIOC_SDSCRUB:
424 			case BIOC_SDREBUILD:
425 				b->b_flags |= B_WRITE;
426 				break;
427 
428 			case BIOC_SDHOTSPARE: /* should never happen */
429 			case BIOC_SDOFFLINE:
430 				wu->swu_io_count--;
431 				sr_ccb_put(ccb);
432 				continue;
433 
434 			default:
435 				goto bad;
436 			}
437 
438 		}
439 		ccb->ccb_target = x;
440 		b->b_dev = sd->sd_vol.sv_chunks[x]->src_dev_mm;
441 		b->b_vp = sd->sd_vol.sv_chunks[x]->src_vn;
442 		if ((b->b_flags & B_READ) == 0)
443 			b->b_vp->v_numoutput++;
444 
445 		LIST_INIT(&b->b_dep);
446 
447 		TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
448 
449 		DNPRINTF(SR_D_DIS, "%s: %s: sr_raid1: b_bcount: %d "
450 		    "b_blkno: %x b_flags 0x%0x b_data %p\n",
451 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
452 		    b->b_bcount, b->b_blkno,
453 		    b->b_flags, b->b_data);
454 	}
455 
456 	s = splbio();
457 
458 	/* rebuild io, let rebuild routine deal with it */
459 	if (wu->swu_flags & SR_WUF_REBUILD)
460 		goto queued;
461 
462 	/* current io failed, restart */
463 	if (wu->swu_state == SR_WU_RESTART)
464 		goto start;
465 
466 	/* deferred io failed, don't restart */
467 	if (wu->swu_state == SR_WU_REQUEUE)
468 		goto queued;
469 
470 	if (sr_check_io_collision(wu))
471 		goto queued;
472 
473 start:
474 	sr_raid_startwu(wu);
475 queued:
476 	splx(s);
477 	return (0);
478 bad:
479 	/* wu is unwound by sr_wu_put */
480 	return (1);
481 }
482 
483 void
484 sr_raid1_intr(struct buf *bp)
485 {
486 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
487 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
488 	struct sr_discipline	*sd = wu->swu_dis;
489 	struct scsi_xfer	*xs = wu->swu_xs;
490 	struct sr_softc		*sc = sd->sd_sc;
491 	struct buf		*b;
492 	int			s, pend;
493 
494 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %x xs %x\n",
495 	    DEVNAME(sc), bp, xs);
496 
497 	b = &ccb->ccb_buf;
498 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
499 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
500 	    b->b_bcount, b->b_resid, b->b_flags, b->b_blkno, ccb->ccb_target);
501 
502 	s = splbio();
503 
504 	if (b->b_flags & B_ERROR) {
505 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
506 		    DEVNAME(sc), b->b_blkno, ccb->ccb_target);
507 		wu->swu_ios_failed++;
508 		ccb->ccb_state = SR_CCB_FAILED;
509 		if (ccb->ccb_target != -1)
510 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
511 			    BIOC_SDOFFLINE);
512 		else
513 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
514 	} else {
515 		ccb->ccb_state = SR_CCB_OK;
516 		wu->swu_ios_succeeded++;
517 	}
518 	wu->swu_ios_complete++;
519 
520 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
521 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
522 	    wu->swu_ios_failed);
523 
524 	if (wu->swu_ios_complete >= wu->swu_io_count) {
525 		/* if all ios failed, retry reads and give up on writes */
526 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
527 			if (xs->flags & SCSI_DATA_IN) {
528 				printf("%s: retrying read on block %lld\n",
529 				    DEVNAME(sc), b->b_blkno);
530 				sr_ccb_put(ccb);
531 				TAILQ_INIT(&wu->swu_ccb);
532 				wu->swu_state = SR_WU_RESTART;
533 				if (sd->sd_scsi_rw(wu))
534 					goto bad;
535 				else
536 					goto retry;
537 			} else {
538 				printf("%s: permanently fail write on block "
539 				    "%lld\n", DEVNAME(sc), b->b_blkno);
540 				xs->error = XS_DRIVER_STUFFUP;
541 				goto bad;
542 			}
543 		}
544 
545 		xs->error = XS_NOERROR;
546 		xs->resid = 0;
547 		xs->flags |= ITSDONE;
548 
549 		pend = 0;
550 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
551 			if (wu == wup) {
552 				/* wu on pendq, remove */
553 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
554 				pend = 1;
555 
556 				if (wu->swu_collider) {
557 					if (wu->swu_ios_failed)
558 						/* toss all ccbs and recreate */
559 						sr_raid1_recreate_wu(wu->swu_collider);
560 
561 					/* restart deferred wu */
562 					wu->swu_collider->swu_state =
563 					    SR_WU_INPROGRESS;
564 					TAILQ_REMOVE(&sd->sd_wu_defq,
565 					    wu->swu_collider, swu_link);
566 					sr_raid_startwu(wu->swu_collider);
567 				}
568 				break;
569 			}
570 		}
571 
572 		if (!pend)
573 			printf("%s: wu: %p not on pending queue\n",
574 			    DEVNAME(sc), wu);
575 
576 		if (wu->swu_flags & SR_WUF_REBUILD) {
577 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
578 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
579 				wakeup(wu);
580 			}
581 		} else {
582 			/* do not change the order of these 2 functions */
583 			sr_wu_put(wu);
584 			scsi_done(xs);
585 		}
586 
587 		if (sd->sd_sync && sd->sd_wu_pending == 0)
588 			wakeup(sd);
589 	}
590 
591 retry:
592 	splx(s);
593 	return;
594 bad:
595 	xs->error = XS_DRIVER_STUFFUP;
596 	xs->flags |= ITSDONE;
597 	if (wu->swu_flags & SR_WUF_REBUILD) {
598 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
599 		wakeup(wu);
600 	} else {
601 		/* do not change the order of these 2 functions */
602 		sr_wu_put(wu);
603 		scsi_done(xs);
604 	}
605 
606 	splx(s);
607 }
608 
609 void
610 sr_raid1_recreate_wu(struct sr_workunit *wu)
611 {
612 	struct sr_discipline	*sd = wu->swu_dis;
613 	struct sr_workunit	*wup = wu;
614 	struct sr_ccb		*ccb;
615 
616 	do {
617 		DNPRINTF(SR_D_INTR, "%s: sr_raid1_recreate_wu: %p\n", wup);
618 
619 		/* toss all ccbs */
620 		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
621 			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
622 			sr_ccb_put(ccb);
623 		}
624 		TAILQ_INIT(&wup->swu_ccb);
625 
626 		/* recreate ccbs */
627 		wup->swu_state = SR_WU_REQUEUE;
628 		if (sd->sd_scsi_rw(wup))
629 			panic("could not requeue io");
630 
631 		wup = wup->swu_collider;
632 	} while (wup);
633 }
634