xref: /openbsd-src/sys/dev/softraid_raid6.c (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1 /* $OpenBSD: softraid_raid6.c,v 1.62 2014/07/12 18:48:51 tedu Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/proc.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/task.h>
38 #include <sys/conf.h>
39 #include <sys/uio.h>
40 
41 #include <scsi/scsi_all.h>
42 #include <scsi/scsiconf.h>
43 #include <scsi/scsi_disk.h>
44 
45 #include <dev/softraidvar.h>
46 #include <dev/rndvar.h>
47 
/*
 * GF(2^8) lookup tables (generator polynomial 0x11D) used for the RAID 6
 * Q parity.  Built lazily: gf_init() fills gf_pow/gf_log, gf_premul()
 * allocates a per-factor multiply table into gf_map[].
 */
uint8_t *gf_map[256];	/* per-factor 256-byte multiply tables (see gf_premul) */
uint8_t	gf_pow[768];	/* powers of 2; entries duplicated at +255 to skip a mod */
int	gf_log[256];	/* discrete logs; gf_log[0] holds the 512 sentinel */
51 
/* RAID 6 functions. */
int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
	    int, int64_t);
int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
	    int, void *);
int	sr_raid6_init(struct sr_discipline *);
int	sr_raid6_rw(struct sr_workunit *);
int	sr_raid6_openings(struct sr_discipline *);
void	sr_raid6_intr(struct buf *);
int	sr_raid6_wu_done(struct sr_workunit *);
void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid6_set_vol_state(struct sr_discipline *);

/* GF(2^8) parity helpers. */
void	sr_raid6_xorp(void *, void *, int);
void	sr_raid6_xorq(void *, void *, int, int);
int	sr_raid6_addio(struct sr_workunit *wu, int, daddr_t, daddr_t,
	    void *, int, int, void *, void *, int);
void	sr_raid6_scrub(struct sr_discipline *);
int	sr_failio(struct sr_workunit *);

void	gf_init(void);
uint8_t gf_inv(uint8_t);
int	gf_premul(uint8_t);
uint8_t gf_mul(uint8_t, uint8_t);

/* Per-stripe failure flags gathered from the chunk states. */
#define SR_NOFAIL		0x00
#define SR_FAILX		(1L << 0)	/* addressed data chunk failed */
#define SR_FAILY		(1L << 1)	/* a second data chunk failed */
#define SR_FAILP		(1L << 2)	/* xor-parity chunk failed */
#define SR_FAILQ		(1L << 3)	/* q-parity chunk failed */

/* Per-ccb completion context, consumed by sr_raid6_intr(). */
struct sr_raid6_opaque {
	int	gn;	/* GF multiplier applied before folding into qbuf */
	void	*pbuf;	/* xor-parity accumulation buffer, may be NULL */
	void	*qbuf;	/* q-parity accumulation buffer, may be NULL */
};
88 
89 /* discipline initialisation. */
90 void
91 sr_raid6_discipline_init(struct sr_discipline *sd)
92 {
93 	/* Initialize GF256 tables. */
94 	gf_init();
95 
96 	/* Fill out discipline members. */
97 	sd->sd_type = SR_MD_RAID6;
98 	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
99 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
100 	    SR_CAP_REDUNDANT;
101 	sd->sd_max_wu = SR_RAID6_NOWU;
102 
103 	/* Setup discipline specific function pointers. */
104 	sd->sd_assemble = sr_raid6_assemble;
105 	sd->sd_create = sr_raid6_create;
106 	sd->sd_openings = sr_raid6_openings;
107 	sd->sd_scsi_rw = sr_raid6_rw;
108 	sd->sd_scsi_intr = sr_raid6_intr;
109 	sd->sd_scsi_wu_done = sr_raid6_wu_done;
110 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
111 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
112 }
113 
/*
 * Create a new RAID 6 volume from no_chunk chunks of coerced_size sectors
 * each.  Two chunks worth of capacity are consumed by the P and Q parity,
 * hence the (no_chunk - 2) data multiplier.  Returns 0 or an errno.
 */
int
sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{
	if (no_chunk < 4) {
		sr_error(sd->sd_sc, "%s requires four or more chunks",
		    sd->sd_name);
		return EINVAL;
	}

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like * to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
	/*
	 * Round the per-chunk sector count down to a whole number of
	 * strips, then scale by the number of data chunks.
	 */
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
	    DEV_BSHIFT) - 1)) * (no_chunk - 2);

	return sr_raid6_init(sd);
}
135 
/*
 * Assemble an existing RAID 6 volume; all metadata has already been
 * loaded, so only the runtime state needs initialising.
 */
int
sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid6_init(sd);
}
142 
/*
 * Common runtime initialisation for create and assemble paths.
 * Validates the strip size and derives the ccb budget per work unit.
 * Returns 0 or EINVAL on a bad strip size.
 */
int
sr_raid6_init(struct sr_discipline *sd)
{
	/* Initialise runtime values. */
	sd->mds.mdd_raid6.sr6_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid6.sr6_strip_bits == -1) {
		sr_error(sd->sd_sc, "invalid strip size");
		return EINVAL;
	}

	/* only if stripsize <= MAXPHYS */
	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);

	return 0;
}
159 
160 int
161 sr_raid6_openings(struct sr_discipline *sd)
162 {
163 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
164 }
165 
/*
 * Move chunk c to new_state, enforcing the legal chunk state machine;
 * an illegal transition panics.  On success the volume state is
 * re-derived and a metadata save is scheduled.
 */
void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* an offline chunk may only come back via a rebuild */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the new chunk state to the on-disk metadata. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
241 
/*
 * Derive the volume state from the per-chunk states and apply it,
 * enforcing the legal volume state machine; an illegal transition
 * panics.  RAID 6 tolerates the loss of any two chunks, hence the
 * offline threshold at nd - 2 online chunks.
 */
void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Count how many chunks are in each state. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Pick the new volume state, most severe condition first. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the transition against the volume state machine. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
364 
365 /*  modes:
366  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
367  *		0, qbuf, NULL, 0);
368  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
369  *		0, pbuf, NULL, 0);
370  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
371  *		0, pbuf, qbuf, gf_pow[i]);
372  */
373 
/*
 * Start a RAID 6 read or write work unit.
 *
 * The request is split along strip boundaries.  Each stripe holds
 * no_chunk data strips plus a P (xor) and a Q (GF(2^8)) parity strip;
 * the parity positions rotate per stripe row.  Reads from a healthy
 * data chunk are issued directly; reads hitting failed chunks are
 * reconstructed from the surviving data and parity.  Writes use
 * read-modify-write: a companion wu (wu_r) reads the old data and old
 * parities, then the deferred wu writes the new data, P and Q.
 *
 * Returns 0 when the wu has been queued, 1 on failure.
 */
int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit	*wu_r = NULL;
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	struct sr_chunk		*scp;
	int			s, fail, i, gxinv, pxinv;
	daddr_t			blk, lba;
	int64_t			chunk_offs, lbaoffs, phys_offs, strip_offs;
	int64_t			strip_no, strip_size, strip_bits;
	int64_t			fchunk, no_chunk, chunk, qchunk, pchunk;
	int64_t			length, datalen, row_size;
	void			*pbuf, *data, *qbuf;

	/* blk and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	/* Number of data chunks; two chunks per stripe hold P and Q. */
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs	= blk << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT) {
		/* Companion wu for the read phase of read-modify-write. */
		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
			printf("%s: can't get wu_r", DEVNAME(sd->sd_sc));
			goto bad;
		}
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
	}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		/* Locate this fragment within its strip and stripe row. */
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		phys_offs = chunk_offs + strip_offs +
		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		/* Q rotates right per row; P sits just before Q. */
		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		/* Skip data index past the parity positions. */
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = phys_offs >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i=0; i< no_chunk+2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good. issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				gxinv = gf_inv(gf_pow[chunk]);

				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
				memset(data, 0, length);
				/* fold Q * inv(gx) into data */
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* Read Dz * gz * inv(gx) */
				for (i = 0; i < no_chunk+2; i++) {
					if  (i == qchunk || i == pchunk || i == chunk)
						continue;

					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}

				/* data will contain correct value on completion */
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
				pxinv = gf_mul(gf_pow[fchunk], gxinv);

				/* read Q * inv(gx + gy) */
				memset(data, 0, length);
				if (sr_raid6_addio(wu, qchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, gxinv))
					goto bad;

				/* read P * gy * inv(gx + gy) */
				if (sr_raid6_addio(wu, pchunk, lba, length,
				    NULL, SCSI_DATA_IN, 0, NULL, data, pxinv))
					goto bad;

				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
				 *   Q:  sr_raid6_xorp(qbuf, --, length);
				 *   P:  sr_raid6_xorp(pbuf, --, length);
				 *   Dz: sr_raid6_xorp(pbuf, --, length);
				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
				 */
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk || i == pchunk ||
					    i == chunk || i == fchunk)
						continue;

					/* read Dz * (gz + gy) * inv(gx + gy) */
					if (sr_raid6_addio(wu, i, lba, length,
					    NULL, SCSI_DATA_IN, 0, NULL, data,
					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
						goto bad;
				}
			} else {
				/* Two cases: single disk (Dx) or (Dx+Q)
				 *   Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/* Calculate: Dx = P^Dz
				 *   P:  sr_raid6_xorp(data, ---, length);
				 *   Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    0, data, NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on completion */
			}
		} else {
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * initialize pbuf with contents of new data to be
			 * written. This will be XORed with old data and old
			 * parity in the intr routine. The result in pbuf
			 * is the new parity data.
			 */
			qbuf = sr_block_get(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_block_get(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, qbuf, gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
				SCSI_DATA_IN, 0, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_r) {
		/* collide write request with reads */
		wu_r->swu_blk_start = wu->swu_blk_start;
		wu_r->swu_blk_end = wu->swu_blk_end;

		/* defer the write until the read phase completes */
		wu->swu_state = SR_WU_DEFERRED;
		wu_r->swu_collider = wu;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);

		wu = wu_r;
	}
	splx(s);

	sr_schedule_wu(wu);

	return (0);
bad:
	/* XXX - can leak pbuf/qbuf on error. */
	/* wu is unwound by sr_wu_put */
	if (wu_r)
		sr_scsi_wu_put(sd, wu_r);
	return (1);
}
644 
645 /* Handle failure I/O completion */
646 int
647 sr_failio(struct sr_workunit *wu)
648 {
649 	struct sr_discipline	*sd = wu->swu_dis;
650 	struct sr_ccb		*ccb;
651 
652 	if (!(wu->swu_flags & SR_WUF_FAIL))
653 		return (0);
654 
655 	/* Wu is a 'fake'.. don't do real I/O just intr */
656 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
657 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
658 		sr_raid6_intr(&ccb->ccb_buf);
659 	return (1);
660 }
661 
/*
 * Per-ccb I/O completion handler.  On a successful read with an
 * attached sr_raid6_opaque, folds the returned data into the pending
 * P and/or Q accumulation buffers, then releases the ccb resources
 * and notifies the work unit.  Runs at splbio.
 */
void
sr_raid6_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid6_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XOR data to result. */
	if (ccb->ccb_state == SR_CCB_OK && pq) {
		if (pq->pbuf)
			/* Calculate xor-parity */
			sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount);
		if (pq->qbuf)
			/* Calculate q-parity */
			sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
			    ccb->ccb_buf.b_bcount, pq->gn);
		free(pq, M_DEVBUF, 0);
		ccb->ccb_opaque = NULL;
	}

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
700 
/*
 * Decide the fate of a completed work unit: success, a read retry
 * (restarted through sd_scsi_rw), or permanent failure.  Returns one
 * of SR_WU_OK / SR_WU_RESTART / SR_WU_FAILED.
 */
int
sr_raid6_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & SR_WUF_DISCIPLINE)
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 6. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		/* Reads can be retried; reissue the whole wu. */
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
734 
735 int
736 sr_raid6_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
737     daddr_t len, void *data, int xsflags, int ccbflags, void *pbuf,
738     void *qbuf, int gn)
739 {
740 	struct sr_discipline	*sd = wu->swu_dis;
741 	struct sr_ccb		*ccb;
742 	struct sr_raid6_opaque  *pqbuf;
743 
744 	DNPRINTF(SR_D_DIS, "sr_raid6_addio: %s %d.%llx %llx %p:%p\n",
745 	    (xsflags & SCSI_DATA_IN) ? "read" : "write", chunk,
746 	    (long long)blkno, (long long)len,
747 	    pbuf, qbuf);
748 
749 	/* Allocate temporary buffer. */
750 	if (data == NULL) {
751 		data = sr_block_get(sd, len);
752 		if (data == NULL)
753 			return (-1);
754 		ccbflags |= SR_CCBF_FREEBUF;
755 	}
756 
757 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
758 	if (ccb == NULL) {
759 		if (ccbflags & SR_CCBF_FREEBUF)
760 			sr_block_put(sd, data, len);
761 		return (-1);
762 	}
763 	if (pbuf || qbuf) {
764 		/* XXX - can leak data and ccb on failure. */
765 		if (qbuf && gf_premul(gn))
766 			return (-1);
767 
768 		/* XXX - should be preallocated? */
769 		pqbuf = malloc(sizeof(struct sr_raid6_opaque),
770 		    M_DEVBUF, M_ZERO | M_NOWAIT);
771 		if (pqbuf == NULL) {
772 			sr_ccb_put(ccb);
773 			return (-1);
774 		}
775 		pqbuf->pbuf = pbuf;
776 		pqbuf->qbuf = qbuf;
777 		pqbuf->gn = gn;
778 		ccb->ccb_opaque = pqbuf;
779 	}
780 	sr_wu_enqueue_ccb(wu, ccb);
781 
782 	return (0);
783 }
784 
/* Perform RAID6 parity calculation.
 *   P=xor parity, Q=GF256 parity, D=data, gn=disk#
 *
 * XOR the data buffer d into the parity accumulator p, one 32-bit
 * word at a time; any trailing bytes past a multiple of 4 are
 * ignored, as before.
 */
void
sr_raid6_xorp(void *p, void *d, int len)
{
	uint32_t	*dst = p, *src = d;
	int		i, words = len >> 2;

	for (i = 0; i < words; i++)
		dst[i] ^= src[i];
}
796 
797 void
798 sr_raid6_xorq(void *q, void *d, int len, int gn)
799 {
800 	uint32_t 	*qbuf = q, *data = d, x;
801 	uint8_t	 	*gn_map = gf_map[gn];
802 
803 	len >>= 2;
804 	while (len--) {
805 		x = *data++;
806 		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
807 		  	    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
808 			    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
809 			    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
810 	}
811 }
812 
813 /* Create GF256 log/pow tables: polynomial = 0x11D */
814 void
815 gf_init(void)
816 {
817 	int i;
818 	uint8_t p = 1;
819 
820 	/* use 2N pow table to avoid using % in multiply */
821 	for (i=0; i<256; i++) {
822 		gf_log[p] = i;
823 		gf_pow[i] = gf_pow[i+255] = p;
824 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
825 	}
826 	gf_log[0] = 512;
827 }
828 
/*
 * Multiplicative inverse in GF(2^8): a^-1 = pow(255 - log(a)).
 * NOTE(review): callers must not pass 0 - gf_log[0] is the 512
 * sentinel, which would make the gf_pow index negative here.
 */
uint8_t
gf_inv(uint8_t a)
{
	return gf_pow[255 - gf_log[a]];
}
834 
/*
 * Multiply in GF(2^8) via log/pow tables: a*b = pow(log(a) + log(b)).
 * A zero operand indexes past the filled part of gf_pow through the
 * gf_log[0] == 512 sentinel and so returns 0.
 */
uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	return gf_pow[gf_log[a] + gf_log[b]];
}
840 
/* Precalculate multiplication tables for drive gn */
int
gf_premul(uint8_t gn)
{
	int i;

	/* Table for this factor already built. */
	if (gf_map[gn] != NULL)
		return (0);

	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
		return (-1);

	/* gf_map[gn][i] = i * gn in GF(2^8); i == 0 maps to 0 via sentinel. */
	for (i=0; i<256; i++)
		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
	return (0);
}
857