xref: /openbsd-src/sys/dev/softraid_raid5.c (revision 91f110e064cd7c194e59e019b83bb7496c1c84d4)
1 /* $OpenBSD: softraid_raid5.c,v 1.15 2014/01/23 00:22:35 jsing Exp $ */
2 /*
3  * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
4  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
5  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include "bio.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/buf.h>
25 #include <sys/device.h>
26 #include <sys/ioctl.h>
27 #include <sys/proc.h>
28 #include <sys/malloc.h>
29 #include <sys/kernel.h>
30 #include <sys/disk.h>
31 #include <sys/rwlock.h>
32 #include <sys/queue.h>
33 #include <sys/fcntl.h>
34 #include <sys/disklabel.h>
35 #include <sys/mount.h>
36 #include <sys/sensors.h>
37 #include <sys/stat.h>
38 #include <sys/task.h>
39 #include <sys/pool.h>
40 #include <sys/conf.h>
41 #include <sys/uio.h>
42 
43 #include <scsi/scsi_all.h>
44 #include <scsi/scsiconf.h>
45 #include <scsi/scsi_disk.h>
46 
47 #include <dev/softraidvar.h>
48 #include <dev/rndvar.h>
49 
50 /* RAID 5 functions. */
51 int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
52 	    int, int64_t);
53 int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
54 	    int, void *);
55 int	sr_raid5_init(struct sr_discipline *);
56 int	sr_raid5_rw(struct sr_workunit *);
57 int	sr_raid5_openings(struct sr_discipline *);
58 void	sr_raid5_intr(struct buf *);
59 int	sr_raid5_wu_done(struct sr_workunit *);
60 void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
61 void	sr_raid5_set_vol_state(struct sr_discipline *);
62 
63 int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, daddr_t,
64 	    void *, int, int, void *);
65 int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, daddr_t,
66 	    void *);
67 int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
68 	    daddr_t, daddr_t, void *, int, int);
69 void	sr_raid5_xor(void *, void *, int);
70 
71 void	sr_raid5_rebuild(struct sr_discipline *);
72 void	sr_raid5_scrub(struct sr_discipline *);
73 
74 /* discipline initialisation. */
75 void
76 sr_raid5_discipline_init(struct sr_discipline *sd)
77 {
78 	/* Fill out discipline members. */
79 	sd->sd_type = SR_MD_RAID5;
80 	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
81 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
82 	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
83 	sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */
84 	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */
85 
86 	/* Setup discipline specific function pointers. */
87 	sd->sd_assemble = sr_raid5_assemble;
88 	sd->sd_create = sr_raid5_create;
89 	sd->sd_openings = sr_raid5_openings;
90 	sd->sd_rebuild = sr_raid5_rebuild;
91 	sd->sd_scsi_rw = sr_raid5_rw;
92 	sd->sd_scsi_intr = sr_raid5_intr;
93 	sd->sd_scsi_wu_done = sr_raid5_wu_done;
94 	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
95 	sd->sd_set_vol_state = sr_raid5_set_vol_state;
96 }
97 
int
sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{
	/* RAID 5 needs at least three chunks (two data plus one parity). */
	if (no_chunk < 3) {
		sr_error(sd->sd_sc, "%s requires three or more chunks",
		    sd->sd_name);
		return EINVAL;
	}

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
	/*
	 * Usable volume size: round the per-chunk size down to a whole
	 * number of strips (strip size is converted from bytes to
	 * DEV_BSIZE blocks for the mask), then multiply by the number of
	 * data chunks — one chunk's worth of space is consumed by parity.
	 */
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
	    DEV_BSHIFT) - 1)) * (no_chunk - 1);

	return sr_raid5_init(sd);
}
119 
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	/* No assemble-specific validation; just set up runtime state. */
	return (sr_raid5_init(sd));
}
126 
int
sr_raid5_init(struct sr_discipline *sd)
{
	/* Initialise runtime values. */
	/*
	 * Cache the strip size as a shift count for the I/O mapping code;
	 * sr_validate_stripsize() returns -1 when the on-disk strip size
	 * is not an acceptable power of two.
	 */
	sd->mds.mdd_raid5.sr5_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
		sr_error(sd->sd_sc, "invalid strip size");
		return EINVAL;
	}

	return 0;
}
140 
141 int
142 sr_raid5_openings(struct sr_discipline *sd)
143 {
144 	/* Two work units per I/O, two for rebuild/scrub. */
145 	return ((sd->sd_max_wu - 2) >> 1);
146 }
147 
/*
 * Move chunk c to new_state, enforcing the set of legal chunk state
 * transitions; any illegal transition panics. On a successful change the
 * volume state is recomputed and a metadata save is scheduled.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	/*
	 * Validate the transition; only the cases listed below are legal.
	 * Falling into "die" (directly or via goto) means a driver bug.
	 */
	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk may only come back via a rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	/* Commit the new chunk state and derive the new volume state. */
	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the state change via the metadata save task. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
222 
/*
 * Recompute the volume state from the states of all chunks and apply it,
 * enforcing the legal volume state transitions (illegal ones panic).
 * For RAID 5 the volume survives the loss of at most one chunk.
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Histogram the chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/*
	 * Derive the volume state: all chunks online -> online; more than
	 * one chunk lost -> offline; otherwise scrub/rebuild activity or a
	 * single missing chunk (degraded).
	 */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the transition from the current volume state. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
347 
348 static inline int
349 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
350 {
351 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
352 	case BIOC_SDONLINE:
353 	case BIOC_SDSCRUB:
354 		return 1;
355 	default:
356 		return 0;
357 	}
358 }
359 
360 static inline int
361 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
362 {
363 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
364 	case BIOC_SDREBUILD:
365 		return 1;
366 	default:
367 		return 0;
368 	}
369 }
370 
371 int
372 sr_raid5_rw(struct sr_workunit *wu)
373 {
374 	struct sr_workunit	*wu_r = NULL;
375 	struct sr_discipline	*sd = wu->swu_dis;
376 	struct scsi_xfer	*xs = wu->swu_xs;
377 	struct sr_chunk		*scp;
378 	daddr_t			blk, lba;
379 	int64_t			chunk_offs, lbaoffs, phys_offs, strip_offs;
380 	int64_t			strip_bits, strip_no, strip_size;
381 	int64_t			chunk, no_chunk;
382 	int64_t			length, parity, datalen, row_size;
383 	void			*data;
384 	int			s;
385 
386 	/* blk and scsi error will be handled by sr_validate_io */
387 	if (sr_validate_io(wu, &blk, "sr_raid5_rw"))
388 		goto bad;
389 
390 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: lba %lld size %d\n",
391 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
392 	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
393 	    (long long)blk, xs->datalen);
394 
395 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
396 	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
397 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
398 	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
399 
400 	data = xs->data;
401 	datalen = xs->datalen;
402 	lbaoffs	= blk << DEV_BSHIFT;
403 
404 	if (xs->flags & SCSI_DATA_OUT) {
405 		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
406 			printf("%s: %s failed to get read work unit",
407 			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
408 			goto bad;
409 		}
410 		wu_r->swu_state = SR_WU_INPROGRESS;
411 		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
412 	}
413 
414 	wu->swu_blk_start = 0;
415 	while (datalen != 0) {
416 		strip_no = lbaoffs >> strip_bits;
417 		strip_offs = lbaoffs & (strip_size - 1);
418 		chunk_offs = (strip_no / no_chunk) << strip_bits;
419 		phys_offs = chunk_offs + strip_offs +
420 		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);
421 
422 		/* get size remaining in this stripe */
423 		length = MIN(strip_size - strip_offs, datalen);
424 
425 		/*
426 		 * Map disk offset to data and parity chunks, using a left
427 		 * asymmetric algorithm for the parity assignment.
428 		 */
429 		chunk = strip_no % no_chunk;
430 		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
431 		if (chunk >= parity)
432 			chunk++;
433 
434 		lba = phys_offs >> DEV_BSHIFT;
435 
436 		/* XXX big hammer.. exclude I/O from entire stripe */
437 		if (wu->swu_blk_start == 0)
438 			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
439 		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
440 		    (row_size - 1);
441 
442 		scp = sd->sd_vol.sv_chunks[chunk];
443 		if (xs->flags & SCSI_DATA_IN) {
444 			switch (scp->src_meta.scm_status) {
445 			case BIOC_SDONLINE:
446 			case BIOC_SDSCRUB:
447 				/*
448 				 * Chunk is online, issue a single read
449 				 * request.
450 				 */
451 				if (sr_raid5_addio(wu, chunk, lba, length,
452 				    data, xs->flags, 0, NULL))
453 					goto bad;
454 				break;
455 			case BIOC_SDOFFLINE:
456 			case BIOC_SDREBUILD:
457 			case BIOC_SDHOTSPARE:
458 				if (sr_raid5_regenerate(wu, chunk, lba,
459 				    length, data))
460 					goto bad;
461 				break;
462 			default:
463 				printf("%s: is offline, can't read\n",
464 				    DEVNAME(sd->sd_sc));
465 				goto bad;
466 			}
467 		} else {
468 			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
469 			    length, data, xs->flags, 0))
470 				goto bad;
471 		}
472 
473 		/* advance to next block */
474 		lbaoffs += length;
475 		datalen -= length;
476 		data += length;
477 	}
478 
479 	s = splbio();
480 	if (wu_r) {
481 		if (wu_r->swu_io_count > 0) {
482 			/* collide write request with reads */
483 			wu_r->swu_blk_start = wu->swu_blk_start;
484 			wu_r->swu_blk_end = wu->swu_blk_end;
485 
486 			wu->swu_state = SR_WU_DEFERRED;
487 			wu_r->swu_collider = wu;
488 			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
489 
490 			wu = wu_r;
491 		} else {
492 			sr_scsi_wu_put(sd, wu_r);
493 		}
494 	}
495 	splx(s);
496 
497 	sr_schedule_wu(wu);
498 
499 	return (0);
500 
501 bad:
502 	/* wu is unwound by sr_wu_put */
503 	if (wu_r)
504 		sr_scsi_wu_put(sd, wu_r);
505 	return (1);
506 }
507 
508 int
509 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
510     daddr_t len, void *data)
511 {
512 	struct sr_discipline	*sd = wu->swu_dis;
513 	int			i;
514 
515 	/*
516 	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
517 	 * from all of the remaining online chunks. This requires the parity
518 	 * to already be correct.
519  	 */
520 
521 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
522 	    "regenerating block %llu\n",
523 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);
524 
525 	memset(data, 0, len);
526 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
527 		if (i == chunk)
528 			continue;
529 		if (!sr_raid5_chunk_online(sd, i))
530 			goto bad;
531 		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
532 		    0, data))
533 			goto bad;
534 	}
535 	return (0);
536 
537 bad:
538 	return (1);
539 }
540 
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, daddr_t len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	void			*xorbuf;
	int			chunk_online, chunk_rebuild;
	int			parity_online, parity_rebuild;
	int			other_offline = 0, other_rebuild = 0;
	int			i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blk %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	/* Classify the state of the target data chunk and the parity chunk. */
	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	/* Scan the remaining chunks for offline/rebuilding members. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): parity is not writable - just write the data chunk. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* New parity accumulates in xorbuf, seeded with the new data. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Case (4): read-modify-write using existing data + parity. */
		/* Read in existing data and existing parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Cases (1) and (3): reconstruct parity from all other data. */
		/* Read in existing data from all other chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity. */
	/* xorbuf ownership passes to the ccb via SR_CCBF_FREEBUF. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	return (1);
}
660 
/*
 * Per-ccb I/O completion handler: fold completed reads into their XOR
 * target, release temporary buffers and account the ccb against its
 * work unit.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	/* ccb_opaque is the XOR target set up by sr_raid5_addio(). */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
691 
/*
 * Work unit completion: decide whether the wu succeeded, should be
 * retried (failed reads are re-issued via sd_scsi_rw) or must be failed
 * back to the SCSI layer.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		/* Retry the read; regeneration may still succeed. */
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
726 
727 int
728 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
729     daddr_t len, void *data, int xsflags, int ccbflags, void *xorbuf)
730 {
731 	struct sr_discipline	*sd = wu->swu_dis;
732 	struct sr_ccb		*ccb;
733 
734 	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
735 	    "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
736 	    chunk, (long long)blkno, (long long)len, xorbuf ? "X0R" : "-");
737 
738 	/* Allocate temporary buffer. */
739 	if (data == NULL) {
740 		data = sr_block_get(sd, len);
741 		if (data == NULL)
742 			return (-1);
743 		ccbflags |= SR_CCBF_FREEBUF;
744 	}
745 
746 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
747 	if (ccb == NULL) {
748 		if (ccbflags & SR_CCBF_FREEBUF)
749 			sr_block_put(sd, data, len);
750 		return (-1);
751 	}
752 	ccb->ccb_opaque = xorbuf;
753 	sr_wu_enqueue_ccb(wu, ccb);
754 
755 	return (0);
756 }
757 
/*
 * XOR buffer b into buffer a, 32 bits at a time. Both buffers are
 * treated as uint32_t arrays; len is in bytes and is expected to be a
 * multiple of four (strip sizes always are).
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t	*dst = a, *src = b;
	int		i;

	for (i = 0; i < len / 4; i++)
		dst[i] ^= src[i];
}
767 
768 void
769 sr_raid5_rebuild(struct sr_discipline *sd)
770 {
771 	int64_t strip_no, strip_size, strip_bits, i, psz, rb;
772 	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
773 	struct sr_workunit *wu_r, *wu_w;
774 	int s, slept, percent = 0, old_percent = -1;
775 	int rebuild_chunk = -1;
776 	void *xorbuf;
777 
778 	/* Find the rebuild chunk. */
779 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
780 		if (sr_raid5_chunk_rebuild(sd, i)) {
781 			rebuild_chunk = i;
782 			break;
783 		}
784 	}
785 	if (rebuild_chunk == -1)
786 		goto bad;
787 
788 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
789 	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
790 	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
791 	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
792 	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
793 	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;
794 
795 	/* XXX - handle restarts. */
796 	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
797 	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
798 	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
799 	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
800 	    row_size);
801 
802 	for (strip_no = 0; strip_no < chunk_strips; strip_no++) {
803 		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no +
804 		    sd->sd_meta->ssd_data_offset;
805 
806 		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
807 		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
808 		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);
809 
810 		wu_w = sr_scsi_wu_get(sd, 0);
811 		wu_r = sr_scsi_wu_get(sd, 0);
812 
813 		xorbuf = sr_block_get(sd, strip_size);
814 		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
815 		    strip_size, xorbuf))
816 			goto bad;
817 		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
818 		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
819 			goto bad;
820 
821 		/* Collide write work unit with read work unit. */
822 		wu_r->swu_state = SR_WU_INPROGRESS;
823 		wu_r->swu_flags |= SR_WUF_REBUILD;
824 		wu_w->swu_state = SR_WU_DEFERRED;
825 		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
826 		wu_r->swu_collider = wu_w;
827 
828 		/* Block I/O to this strip while we rebuild it. */
829 		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
830 		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
831 		wu_w->swu_blk_start = wu_r->swu_blk_start;
832 		wu_w->swu_blk_end = wu_r->swu_blk_end;
833 
834 		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
835 		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
836 		    sd->sd_meta->ssd_devname,
837 		    wu_r->swu_blk_start, wu_r->swu_blk_end);
838 
839 		s = splbio();
840 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
841 		splx(s);
842 
843 		sr_schedule_wu(wu_r);
844 
845 		slept = 0;
846 		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
847 			tsleep(wu_w, PRIBIO, "sr_rebuild", 0);
848 			slept = 1;
849 		}
850 		if (!slept)
851 			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
852 
853 		sr_scsi_wu_put(sd, wu_r);
854 		sr_scsi_wu_put(sd, wu_w);
855 
856 		sd->sd_meta->ssd_rebuild =
857 		    (chunk_lba - sd->sd_meta->ssd_data_offset) * chunk_count;
858 
859 		psz = sd->sd_meta->ssdi.ssd_size;
860 		rb = sd->sd_meta->ssd_rebuild;
861 		if (rb > 0)
862 			percent = 100 - ((psz * 100 - rb * 100) / psz) - 1;
863 		else
864 			percent = 0;
865 		if (percent != old_percent && strip_no != chunk_strips - 1) {
866 			if (sr_meta_save(sd, SR_META_DIRTY))
867 				printf("%s: could not save metadata to %s\n",
868 				    DEVNAME(sd->sd_sc),
869 				    sd->sd_meta->ssd_devname);
870 			old_percent = percent;
871 		}
872 
873 		if (sd->sd_reb_abort)
874 			goto abort;
875 	}
876 
877 	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
878 	    sd->sd_meta->ssd_devname);
879 
880 	/* all done */
881 	sd->sd_meta->ssd_rebuild = 0;
882 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
883 		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
884 		    BIOC_SDREBUILD) {
885 			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
886 			break;
887 		}
888 	}
889 
890 	return;
891 
892 abort:
893 	if (sr_meta_save(sd, SR_META_DIRTY))
894 		printf("%s: could not save metadata to %s\n",
895 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
896 bad:
897 	return;
898 }
899 
#if 0
/*
 * Scrub a RAID 5 volume by reading all data strips in each stripe,
 * recomputing the parity and writing it back. Currently disabled and
 * incomplete: the strip addresses are placeholders (0xBADCAFE), the
 * per-iteration addio/block-get failures are unhandled, and the work
 * units are never released.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		/* Left asymmetric parity placement, as in sr_raid5_rw(). */
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		/* Accumulate the data strips into xorbuf. */
		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		/* Write the recomputed parity back. */
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		/* Wait for the parity write to finish. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_scrub", 0);
			slept = 1;
		}
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
	}
done:
	return;
}
#endif
956