1 /* $OpenBSD: softraid_raid6.c,v 1.25 2011/12/25 15:28:17 jsing Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/proc.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/conf.h>
38 #include <sys/uio.h>
39 
40 #include <scsi/scsi_all.h>
41 #include <scsi/scsiconf.h>
42 #include <scsi/scsi_disk.h>
43 
44 #include <dev/softraidvar.h>
45 #include <dev/rndvar.h>
46 
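/*
 * GF(2^8) arithmetic tables (polynomial 0x11D), set up by gf_init():
 *   gf_pow[]  - powers of the generator, stored twice over so gf_mul()
 *               can add logs without a mod 255.
 *   gf_log[]  - discrete log of each field element.
 *   gf_map[g] - 256-byte multiply-by-g lookup table, allocated on demand
 *               by gf_premul() and used by sr_raid6_xorq().
 */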
47 uint8_t *gf_map[256];
48 uint8_t	gf_pow[768];
49 int	gf_log[256];
50 
51 /* RAID 6 functions. */
52 int	sr_raid6_create(struct sr_discipline *, struct bioc_createraid *,
53 	    int, int64_t);
54 int	sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *,
55 	    int);
56 int	sr_raid6_alloc_resources(struct sr_discipline *);
57 int	sr_raid6_free_resources(struct sr_discipline *);
58 int	sr_raid6_rw(struct sr_workunit *);
59 int	sr_raid6_openings(struct sr_discipline *);
60 void	sr_raid6_intr(struct buf *);
61 void	sr_raid6_recreate_wu(struct sr_workunit *);
62 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
63 void	sr_raid6_set_vol_state(struct sr_discipline *);
64 
65 void	sr_raid6_xorp(void *, void *, int);
66 void	sr_raid6_xorq(void *, void *, int, int);
67 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
68 	    void *, int, int, void *, void *, int);
69 void	sr_dump(void *, int);
70 void	sr_raid6_scrub(struct sr_discipline *);
71 int	sr_failio(struct sr_workunit *);
72 
73 void	*sr_get_block(struct sr_discipline *, int);
74 void	sr_put_block(struct sr_discipline *, void *, int);
75 
76 void	gf_init(void);
77 uint8_t gf_inv(uint8_t);
78 int	gf_premul(uint8_t);
79 uint8_t gf_mul(uint8_t, uint8_t);
80 
81 #define SR_NOFAIL		0x00
82 #define SR_FAILX		(1L << 0)
83 #define SR_FAILY		(1L << 1)
84 #define SR_FAILP		(1L << 2)
85 #define SR_FAILQ		(1L << 3)
86 
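/*
 * Per-ccb context for parity accumulation: on I/O completion the freshly
 * read data is XORed into pbuf (plain, for P) and/or into qbuf after
 * GF(2^8) multiplication by the coefficient gn (for Q).
 */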
87 struct sr_raid6_opaque {
88 	int      gn;
89 	void	*pbuf;
90 	void	*qbuf;
91 };
92 
93 /* discipline initialisation. */
94 void
95 sr_raid6_discipline_init(struct sr_discipline *sd)
96 {
97 
98 	/* Initialize GF256 tables. */
99 	gf_init();
100 
101 	/* Fill out discipline members. */
102 	sd->sd_type = SR_MD_RAID6;
103 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE;
104 	sd->sd_max_wu = SR_RAID6_NOWU;
105 
106 	/* Setup discipline specific function pointers. */
107 	sd->sd_alloc_resources = sr_raid6_alloc_resources;
108 	sd->sd_assemble = sr_raid6_assemble;
109 	sd->sd_create = sr_raid6_create;
110 	sd->sd_free_resources = sr_raid6_free_resources;
111 	sd->sd_openings = sr_raid6_openings;
112 	sd->sd_scsi_rw = sr_raid6_rw;
113 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
114 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
115 }
116 
117 int
118 sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc,
119     int no_chunk, int64_t coerced_size)
120 {
121 
122 	if (no_chunk < 4)
123 		return EINVAL;
124 
125 	strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name));
126 
127 	/*
128 	 * XXX add variable strip size later even though MAXPHYS is really
129 	 * the clever value; users like to tinker with that type of stuff.
130 	 */
131 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
132 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
133 	    ~((sd->sd_meta->ssdi.ssd_strip_size >> DEV_BSHIFT) - 1)) *
134 	    (no_chunk - 2);
135 
136 	/* only if stripsize <= MAXPHYS */
137 	sd->sd_max_ccb_per_wu = max(6, 2 * no_chunk);
138 
139 	return 0;
140 }
141 
142 int
143 sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
144     int no_chunk)
145 {
146 
147 	/* only if stripsize <= MAXPHYS */
148 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no);
149 
150 	return 0;
151 }
152 
153 int
154 sr_raid6_openings(struct sr_discipline *sd)
155 {
156 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
157 }
158 
159 int
160 sr_raid6_alloc_resources(struct sr_discipline *sd)
161 {
162 	int			rv = EINVAL;
163 
164 	if (!sd)
165 		return (rv);
166 
167 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
168 	    DEVNAME(sd->sd_sc));
169 
170 	if (sr_wu_alloc(sd))
171 		goto bad;
172 	if (sr_ccb_alloc(sd))
173 		goto bad;
174 
175 	/* setup runtime values */
176 	sd->mds.mdd_raid6.sr6_strip_bits =
177 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
178 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
179 		goto bad;
180 
181 	rv = 0;
182 bad:
183 	return (rv);
184 }
185 
186 int
187 sr_raid6_free_resources(struct sr_discipline *sd)
188 {
189 	int			rv = EINVAL;
190 
191 	if (!sd)
192 		return (rv);
193 
194 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
195 	    DEVNAME(sd->sd_sc));
196 
197 	sr_wu_free(sd);
198 	sr_ccb_free(sd);
199 
200 	rv = 0;
201 	return (rv);
202 }
203 
204 void
205 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
206 {
207 	int			old_state, s;
208 
209 	/* XXX this is for RAID 0 */
210 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid6_set_chunk_state %d -> %d\n",
211 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
212 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
213 
214 	/* ok to go to splbio since this only happens in error path */
215 	s = splbio();
216 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
217 
218 	/* multiple IOs to the same chunk that fail will come through here */
219 	if (old_state == new_state)
220 		goto done;
221 
222 	switch (old_state) {
223 	case BIOC_SDONLINE:
224 		switch (new_state) {
225 		case BIOC_SDOFFLINE:
226 		case BIOC_SDSCRUB:
227 			break;
228 		default:
229 			goto die;
230 		}
231 		break;
232 
233 	case BIOC_SDOFFLINE:
234 		if (new_state == BIOC_SDREBUILD) {
235 			;
236 		} else
237 			goto die;
238 		break;
239 
240 	case BIOC_SDSCRUB:
241 		switch (new_state) {
242 		case BIOC_SDONLINE:
243 		case BIOC_SDOFFLINE:
244 			break;
245 		default:
246 			goto die;
247 		}
248 		break;
249 
250 	case BIOC_SDREBUILD:
251 		switch (new_state) {
252 		case BIOC_SDONLINE:
253 		case BIOC_SDOFFLINE:
254 			break;
255 		default:
256 			goto die;
257 		}
258 		break;
259 
260 	default:
261 die:
262 		splx(s); /* XXX */
263 		panic("%s: %s: %s: invalid chunk state transition "
264 		    "%d -> %d", DEVNAME(sd->sd_sc),
265 		    sd->sd_meta->ssd_devname,
266 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
267 		    old_state, new_state);
268 		/* NOTREACHED */
269 	}
270 
271 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
272 	sd->sd_set_vol_state(sd);
273 
274 	sd->sd_must_flush = 1;
275 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
276 done:
277 	splx(s);
278 }
279 
280 void
281 sr_raid6_set_vol_state(struct sr_discipline *sd)
282 {
283 	int			states[SR_MAX_STATES];
284 	int			new_state, i, s, nd;
285 	int			old_state = sd->sd_vol_status;
286 
287 	/* XXX this is for RAID 0 */
288 
289 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid6_set_vol_state\n",
290 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
291 
292 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
293 
294 	for (i = 0; i < SR_MAX_STATES; i++)
295 		states[i] = 0;
296 
297 	for (i = 0; i < nd; i++) {
298 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
299 		if (s >= SR_MAX_STATES)
300 			panic("%s: %s: %s: invalid chunk state",
301 			    DEVNAME(sd->sd_sc),
302 			    sd->sd_meta->ssd_devname,
303 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
304 		states[s]++;
305 	}
306 
307 	if (states[BIOC_SDONLINE] == nd)
308 		new_state = BIOC_SVONLINE;
309 	else if (states[BIOC_SDONLINE] < nd - 2)
310 		new_state = BIOC_SVOFFLINE;
311 	else if (states[BIOC_SDSCRUB] != 0)
312 		new_state = BIOC_SVSCRUB;
313 	else if (states[BIOC_SDREBUILD] != 0)
314 		new_state = BIOC_SVREBUILD;
315 	else if (states[BIOC_SDONLINE] < nd)
316 		new_state = BIOC_SVDEGRADED;
317 	else {
318 		printf("old_state = %d, ", old_state);
319 		for (i = 0; i < nd; i++)
320 			printf("%d = %d, ", i,
321 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
322 		panic("invalid new_state");
323 	}
324 
325 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid6_set_vol_state %d -> %d\n",
326 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
327 	    old_state, new_state);
328 
329 	switch (old_state) {
330 	case BIOC_SVONLINE:
331 		switch (new_state) {
332 		case BIOC_SVONLINE: /* can go to same state */
333 		case BIOC_SVOFFLINE:
334 		case BIOC_SVDEGRADED:
335 		case BIOC_SVREBUILD: /* happens on boot */
336 			break;
337 		default:
338 			goto die;
339 		}
340 		break;
341 
342 	case BIOC_SVOFFLINE:
343 		/* XXX this might be a little too much */
344 		goto die;
345 
346 	case BIOC_SVSCRUB:
347 		switch (new_state) {
348 		case BIOC_SVONLINE:
349 		case BIOC_SVOFFLINE:
350 		case BIOC_SVDEGRADED:
351 		case BIOC_SVSCRUB: /* can go to same state */
352 			break;
353 		default:
354 			goto die;
355 		}
356 		break;
357 
358 	case BIOC_SVBUILDING:
359 		switch (new_state) {
360 		case BIOC_SVONLINE:
361 		case BIOC_SVOFFLINE:
362 		case BIOC_SVBUILDING: /* can go to the same state */
363 			break;
364 		default:
365 			goto die;
366 		}
367 		break;
368 
369 	case BIOC_SVREBUILD:
370 		switch (new_state) {
371 		case BIOC_SVONLINE:
372 		case BIOC_SVOFFLINE:
373 		case BIOC_SVDEGRADED:
374 		case BIOC_SVREBUILD: /* can go to the same state */
375 			break;
376 		default:
377 			goto die;
378 		}
379 		break;
380 
381 	case BIOC_SVDEGRADED:
382 		switch (new_state) {
383 		case BIOC_SVOFFLINE:
384 		case BIOC_SVREBUILD:
385 		case BIOC_SVDEGRADED: /* can go to the same state */
386 			break;
387 		default:
388 			goto die;
389 		}
390 		break;
391 
392 	default:
393 die:
394 		panic("%s: %s: invalid volume state transition %d -> %d",
395 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
396 		    old_state, new_state);
397 		/* NOTREACHED */
398 	}
399 
400 	sd->sd_vol_status = new_state;
401 }
402 
403 /*  modes:
404  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
405  *	        SR_CCBF_FREEBUF, qbuf, NULL, 0);
406  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
407  *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
408  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
409  *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
410  */
411 
412 int
413 sr_raid6_rw(struct sr_workunit *wu)
414 {
415 	struct sr_workunit	*wu_r = NULL;
416 	struct sr_discipline	*sd = wu->swu_dis;
417 	struct scsi_xfer	*xs = wu->swu_xs;
418 	struct sr_chunk		*scp;
419 	int			s, fail, i, gxinv, pxinv;
420 	daddr64_t		blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
421 	daddr64_t		strip_size, no_chunk, lba, chunk_offs, phys_offs;
422 	daddr64_t		strip_bits, length, strip_offs, datalen, row_size;
423 	void			*pbuf, *data, *qbuf;
424 
425 	/* blk and scsi error will be handled by sr_validate_io */
426 	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
427 		goto bad;
428 
429 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
430 	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
431 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
432 	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
433 
434 	data = xs->data;
435 	datalen = xs->datalen;
436 	lbaoffs	= blk << DEV_BSHIFT;
437 
438 	if (xs->flags & SCSI_DATA_OUT)
439 		/* create write workunit */
440 		if ((wu_r = scsi_io_get(&sd->sd_iopool, SCSI_NOSLEEP)) == NULL) {
441 			printf("%s: can't get wu_r\n", DEVNAME(sd->sd_sc));
442 			goto bad;
443 		}
444 
445 	wu->swu_blk_start = 0;
446 	while (datalen != 0) {
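		/*
		 * Locate this transfer within the volume: strip_no counts
		 * strips across all data chunks, strip_offs is the byte
		 * offset within the strip, chunk_offs the byte offset of
		 * this row within a chunk, and phys_offs adds the metadata
		 * area at the start of each chunk.
		 */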
447 		strip_no = lbaoffs >> strip_bits;
448 		strip_offs = lbaoffs & (strip_size - 1);
449 		chunk_offs = (strip_no / no_chunk) << strip_bits;
450 		phys_offs = chunk_offs + strip_offs +
451 		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);
452 
453 		/* get size remaining in this stripe */
454 		length = MIN(strip_size - strip_offs, datalen);
455 
456 		/* map disk offset to parity/data drive */
457 		chunk = strip_no % no_chunk;
458 
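		/*
		 * Rotate parity placement per row of strips: Q steps back
		 * one chunk every row, P sits immediately before Q
		 * (wrapping to the last chunk when Q reaches chunk 0), and
		 * the data chunk index is bumped past both parity chunks.
		 */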
459 		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
460 		if (qchunk == 0)
461 			pchunk = no_chunk + 1;
462 		else
463 			pchunk = qchunk - 1;
464 		if (chunk >= pchunk)
465 			chunk++;
466 		if (chunk >= qchunk)
467 			chunk++;
468 
469 		lba = phys_offs >> DEV_BSHIFT;
470 
471 		/* XXX big hammer.. exclude I/O from entire stripe */
472 		if (wu->swu_blk_start == 0)
473 			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
474 		wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1);
475 
476 		fail = 0;
477 		fchunk = -1;
478 
479 		/* Get disk-fail flags */
480 		for (i = 0; i < no_chunk + 2; i++) {
481 			scp = sd->sd_vol.sv_chunks[i];
482 			switch (scp->src_meta.scm_status) {
483 			case BIOC_SDOFFLINE:
484 			case BIOC_SDREBUILD:
485 			case BIOC_SDHOTSPARE:
486 				if (i == qchunk)
487 					fail |= SR_FAILQ;
488 				else if (i == pchunk)
489 					fail |= SR_FAILP;
490 				else if (i == chunk)
491 					fail |= SR_FAILX;
492 				else {
493 					/* dual data-disk failure */
494 					fail |= SR_FAILY;
495 					fchunk = i;
496 				}
497 				break;
498 			}
499 		}
500 		if (xs->flags & SCSI_DATA_IN) {
501 			if (!(fail & SR_FAILX)) {
502 				/* drive is good. issue single read request */
503 				if (sr_raid6_addio(wu, chunk, lba, length,
504 				    data, xs->flags, 0, NULL, NULL, 0))
505 					goto bad;
506 			} else if (fail & SR_FAILP) {
507 				/* Dx, P failed */
508 				printf("Disk %llx offline, "
509 				    "regenerating Dx+P\n", chunk);
510 
511 				gxinv = gf_inv(gf_pow[chunk]);
512 
513 				/* Calculate: Dx = (Q^Dz*gz)*inv(gx) */
514 				memset(data, 0, length);
515 				if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
516 				    SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data,
517 				    gxinv))
518 					goto bad;
519 
520 				/* Read Dz * gz * inv(gx) */
521 				for (i = 0; i < no_chunk+2; i++) {
522 					if (i == qchunk || i == pchunk || i == chunk)
523 						continue;
524 
525 					if (sr_raid6_addio(wu, i, lba,
526 					   length, NULL, SCSI_DATA_IN,
527 					   SR_CCBF_FREEBUF, NULL,
528 					   data, gf_mul(gf_pow[i], gxinv)))
529 						goto bad;
530 				}
531 
532 				/* data will contain correct value on completion */
533 			} else if (fail & SR_FAILY) {
534 				/* Dx, Dy failed */
535 				printf("Disk %llx & %llx offline, "
536 				    "regenerating Dx+Dy\n", chunk, fchunk);
537 
538 				gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]);
539 				pxinv = gf_mul(gf_pow[fchunk], gxinv);
540 
541 				/* read Q * inv(gx + gy) */
542 				memset(data, 0, length);
543 				if (sr_raid6_addio(wu, qchunk, lba,
544 				    length,  NULL, SCSI_DATA_IN,
545 				    SR_CCBF_FREEBUF, NULL,
546 				    data, gxinv))
547 					goto bad;
548 
549 				/* read P * gy * inv(gx + gy) */
550 				if (sr_raid6_addio(wu, pchunk, lba,
551 				    length,  NULL, SCSI_DATA_IN,
552 				    SR_CCBF_FREEBUF, NULL,
553 				    data, pxinv))
554 					goto bad;
555 
556 				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
557 				 *   Q:  sr_raid6_xorp(qbuf, --, length);
558 				 *   P:  sr_raid6_xorp(pbuf, --, length);
559 				 *   Dz: sr_raid6_xorp(pbuf, --, length);
560 				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
561 				 */
562 				for (i = 0; i < no_chunk+2; i++) {
563 					if (i == qchunk || i == pchunk ||
564 					    i == chunk || i == fchunk)
565 						continue;
566 
567 					/* read Dz * (gz + gy) * inv(gx + gy) */
568 					if (sr_raid6_addio(wu, i, lba,
569 					    length, NULL, SCSI_DATA_IN,
570 					    SR_CCBF_FREEBUF, NULL, data,
571 					    pxinv ^ gf_mul(gf_pow[i], gxinv)))
572 						goto bad;
573 				}
574 			} else {
575 				/* Two cases: single disk (Dx) or (Dx+Q)
576 				 *   Dx = Dz ^ P (same as RAID5)
577 				 */
578 				printf("Disk %llx offline, "
579 				    "regenerating Dx%s\n", chunk,
580 				    fail & SR_FAILQ ? "+Q" : " single");
581 
582 				/* Calculate: Dx = P^Dz
583 				 *   P:  sr_raid6_xorp(data, ---, length);
584 				 *   Dz: sr_raid6_xorp(data, ---, length);
585 				 */
586 				memset(data, 0, length);
587 				for (i = 0; i < no_chunk+2; i++) {
588 					if (i != chunk && i != qchunk) {
589 						/* Read Dz */
590 						if (sr_raid6_addio(wu, i, lba,
591 						    length, NULL, SCSI_DATA_IN,
592 						    SR_CCBF_FREEBUF, data,
593 						    NULL, 0))
594 							goto bad;
595 					}
596 				}
597 
598 				/* data will contain correct value on completion */
599 			}
600 		} else {
601 			/* XXX handle writes to failed/offline disk? */
602 			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
603 				goto bad;
604 
605 			/*
606 			 * initialize pbuf with contents of new data to be
607 			 * written. This will be XORed with old data and old
608 			 * parity in the intr routine. The result in pbuf
609 			 * is the new parity data.
610 			 */
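			/*
			 * Read-modify-write parity update:
			 *   P_new = P_old ^ D_old ^ D_new
			 *   Q_new = Q_old ^ (gn * D_old) ^ (gn * D_new)
			 * pbuf/qbuf are seeded with D_new below; the reads
			 * queued on wu_r fold in the old data and parity.
			 */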
611 			qbuf = sr_get_block(sd, length);
612 			if (qbuf == NULL)
613 				goto bad;
614 
615 			pbuf = sr_get_block(sd, length);
616 			if (pbuf == NULL)
617 				goto bad;
618 
619 			/* Calculate P = Dn; Q = gn * Dn */
620 			if (gf_premul(gf_pow[chunk]))
621 				goto bad;
622 			sr_raid6_xorp(pbuf, data, length);
623 			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);
624 
625 			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
626 			if (sr_raid6_addio(wu_r, chunk, lba, length, NULL,
627 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
628 				gf_pow[chunk]))
629 				goto bad;
630 
631 			/* Read old xor-parity: P ^= P' */
632 			if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL,
633 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
634 				goto bad;
635 
636 			/* Read old q-parity: Q ^= Q' */
637 			if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL,
638 				SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
639 				goto bad;
640 
641 			/* write new data */
642 			if (sr_raid6_addio(wu, chunk, lba, length, data,
643 			    xs->flags, 0, NULL, NULL, 0))
644 				goto bad;
645 
646 			/* write new xor-parity */
647 			if (sr_raid6_addio(wu, pchunk, lba, length, pbuf,
648 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
649 				goto bad;
650 
651 			/* write new q-parity */
652 			if (sr_raid6_addio(wu, qchunk, lba, length, qbuf,
653 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
654 				goto bad;
655 		}
656 
657 		/* advance to next block */
658 		lbaoffs += length;
659 		datalen -= length;
660 		data += length;
661 	}
662 
663 	s = splbio();
664 	if (wu_r) {
665 		/* collide write request with reads */
666 		wu_r->swu_blk_start = wu->swu_blk_start;
667 		wu_r->swu_blk_end = wu->swu_blk_end;
668 
669 		wu->swu_state = SR_WU_DEFERRED;
670 		wu_r->swu_collider = wu;
671 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
672 
673 		wu = wu_r;
674 	}
675 
676 	/* rebuild io, let rebuild routine deal with it */
677 	if (wu->swu_flags & SR_WUF_REBUILD)
678 		goto queued;
679 
680 	/* current io failed, restart */
681 	if (wu->swu_state == SR_WU_RESTART)
682 		goto start;
683 
684 	/* deferred io failed, don't restart */
685 	if (wu->swu_state == SR_WU_REQUEUE)
686 		goto queued;
687 
688 	if (sr_check_io_collision(wu))
689 		goto queued;
690 
691 start:
692 	sr_raid_startwu(wu);
693 queued:
694 	splx(s);
695 	return (0);
696 bad:
697 	/* wu is unwound by sr_wu_put */
698 	if (wu_r)
699 		scsi_io_put(&sd->sd_iopool, wu_r);
700 	return (1);
701 }
702 
703 /* Handle failure I/O completion */
704 int
705 sr_failio(struct sr_workunit *wu)
706 {
707 	struct sr_discipline	*sd = wu->swu_dis;
708 	struct sr_ccb		*ccb;
709 
710 	if (!(wu->swu_flags & SR_WUF_FAIL))
711 		return (0);
712 
713 	/* The wu is a fake; don't do real I/O, just run the completion code */
714 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
715 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
716 		sr_raid6_intr(&ccb->ccb_buf);
717 	return (1);
718 }
719 
720 void
721 sr_raid6_intr(struct buf *bp)
722 {
723 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
724 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
725 	struct sr_discipline	*sd = wu->swu_dis;
726 	struct scsi_xfer	*xs = wu->swu_xs;
727 	struct sr_softc		*sc = sd->sd_sc;
728 	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
729 	int			s, pend;
730 
731 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
732 	    DEVNAME(sc), bp, xs);
733 
734 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
735 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
736 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
737 	    ccb->ccb_buf.b_blkno, ccb->ccb_target);
738 
739 	s = splbio();
740 
741 	if (ccb->ccb_buf.b_flags & B_ERROR) {
742 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
743 		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
744 		printf("io error: disk %x\n", ccb->ccb_target);
745 		wu->swu_ios_failed++;
746 		ccb->ccb_state = SR_CCB_FAILED;
747 		if (ccb->ccb_target != -1)
748 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
749 			    BIOC_SDOFFLINE);
750 		else
751 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
752 	} else {
753 		ccb->ccb_state = SR_CCB_OK;
754 		wu->swu_ios_succeeded++;
755 
756 		/* XOR data to result */
757 		if (pq) {
758 			if (pq->pbuf)
759 				/* Calculate xor-parity */
760 				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
761 				    ccb->ccb_buf.b_bcount);
762 			if (pq->qbuf)
763 				/* Calculate q-parity */
764 				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
765 				    ccb->ccb_buf.b_bcount, pq->gn);
766 			free(pq, M_DEVBUF);
767 			ccb->ccb_opaque = NULL;
768 		}
769 	}
770 
771 	/* free allocated data buffer */
772 	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
773 		sr_put_block(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
774 		ccb->ccb_buf.b_data = NULL;
775 	}
776 	wu->swu_ios_complete++;
777 
778 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
779 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
780 	    wu->swu_ios_failed);
781 
782 	if (wu->swu_ios_complete >= wu->swu_io_count) {
783 
784 		/* if all ios failed, retry reads and give up on writes */
785 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
786 			if (xs->flags & SCSI_DATA_IN) {
787 				printf("%s: retrying read on block %lld\n",
788 				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
789 				sr_ccb_put(ccb);
790 				TAILQ_INIT(&wu->swu_ccb);
791 				wu->swu_state = SR_WU_RESTART;
792 				if (sd->sd_scsi_rw(wu))
793 					goto bad;
794 				else
795 					goto retry;
796 			} else {
797 				printf("%s: permanently fail write on block "
798 				    "%lld\n", DEVNAME(sc),
799 				    ccb->ccb_buf.b_blkno);
800 				xs->error = XS_DRIVER_STUFFUP;
801 				goto bad;
802 			}
803 		}
804 
805 		if (xs != NULL) {
806 			xs->error = XS_NOERROR;
807 			xs->resid = 0;
808 		}
809 
810 		pend = 0;
811 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
812 			if (wu == wup) {
813 				/* wu on pendq, remove */
814 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
815 				pend = 1;
816 
817 				if (wu->swu_collider) {
818 					if (wu->swu_ios_failed)
819 						/* toss all ccbs and recreate */
820 						sr_raid6_recreate_wu(wu->swu_collider);
821 
822 					/* restart deferred wu */
823 					wu->swu_collider->swu_state =
824 					    SR_WU_INPROGRESS;
825 					TAILQ_REMOVE(&sd->sd_wu_defq,
826 					    wu->swu_collider, swu_link);
827 					if (sr_failio(wu->swu_collider) == 0)
828 						sr_raid_startwu(wu->swu_collider);
829 				}
830 				break;
831 			}
832 		}
833 
834 		if (!pend)
835 			printf("%s: wu: %p not on pending queue\n",
836 			    DEVNAME(sc), wu);
837 
838 		if (wu->swu_flags & SR_WUF_REBUILD) {
839 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
840 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
841 				wakeup(wu);
842 			}
843 		} else {
844 			if (xs != NULL)
845 				scsi_done(xs);
846 			else
847 				scsi_io_put(&sd->sd_iopool, wu);
848 		}
849 
850 		if (sd->sd_sync && sd->sd_wu_pending == 0)
851 			wakeup(sd);
852 	}
853 
854 retry:
855 	splx(s);
856 	return;
857 bad:
858 	xs->error = XS_DRIVER_STUFFUP;
859 	if (wu->swu_flags & SR_WUF_REBUILD) {
860 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
861 		wakeup(wu);
862 	} else {
863 		scsi_done(xs);
864 	}
865 
866 	splx(s);
867 }
868 
869 void
870 sr_raid6_recreate_wu(struct sr_workunit *wu)
871 {
872 	struct sr_discipline	*sd = wu->swu_dis;
873 	struct sr_workunit	*wup = wu;
874 	struct sr_ccb		*ccb;
875 
876 	do {
877 		DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", DEVNAME(sd->sd_sc), wup);
878 
879 		/* toss all ccbs */
880 		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
881 			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
882 			sr_ccb_put(ccb);
883 		}
884 		TAILQ_INIT(&wup->swu_ccb);
885 
886 		/* recreate ccbs */
887 		wup->swu_state = SR_WU_REQUEUE;
888 		if (sd->sd_scsi_rw(wup))
889 			panic("could not requeue io");
890 
891 		wup = wup->swu_collider;
892 	} while (wup);
893 }
894 
895 int
896 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len,
897     void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn)
898 {
899 	struct sr_discipline	*sd = wu->swu_dis;
900 	struct sr_ccb		*ccb;
901 	struct sr_raid6_opaque  *pqbuf;
902 
903 	ccb = sr_ccb_get(sd);
904 	if (!ccb)
905 		return (-1);
906 
907 	/* allocate temporary buffer */
908 	if (data == NULL &&
909 	    (data = sr_get_block(sd, len)) == NULL) {
910 		sr_ccb_put(ccb);
911 		return (-1);
912 	}
913 
914 	DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n",
915 	    flag & SCSI_DATA_IN ? "read" : "write",
916 	    dsk, blk, len, pbuf, qbuf);
917 
918 	ccb->ccb_flag = ccbflag;
919 	if (flag & SCSI_POLL) {
920 		ccb->ccb_buf.b_flags = 0;
921 		ccb->ccb_buf.b_iodone = NULL;
922 	} else {
923 		ccb->ccb_buf.b_flags = B_CALL;
924 		ccb->ccb_buf.b_iodone = sr_raid6_intr;
925 	}
926 	if (flag & SCSI_DATA_IN)
927 		ccb->ccb_buf.b_flags |= B_READ;
928 	else
929 		ccb->ccb_buf.b_flags |= B_WRITE;
930 
931 	/* add offset for metadata */
932 	ccb->ccb_buf.b_flags |= B_PHYS;
933 	ccb->ccb_buf.b_blkno = blk;
934 	ccb->ccb_buf.b_bcount = len;
935 	ccb->ccb_buf.b_bufsize = len;
936 	ccb->ccb_buf.b_resid = len;
937 	ccb->ccb_buf.b_data = data;
938 	ccb->ccb_buf.b_error = 0;
939 	ccb->ccb_buf.b_proc = curproc;
940 	ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm;
941 	ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn;
942 	ccb->ccb_buf.b_bq = NULL;
943 	if ((ccb->ccb_buf.b_flags & B_READ) == 0)
944 		ccb->ccb_buf.b_vp->v_numoutput++;
945 
946 	ccb->ccb_wu = wu;
947 	ccb->ccb_target = dsk;
948 	if (pbuf || qbuf) {
949 		if (qbuf && gf_premul(gn))
950 			return (-1);
951 
952 		pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_ZERO | M_NOWAIT);
953 		if (pqbuf == NULL) {
954 			sr_ccb_put(ccb);
955 			return (-1);
956 		}
957 		pqbuf->pbuf = pbuf;
958 		pqbuf->qbuf = qbuf;
959 		pqbuf->gn = gn;
960 		ccb->ccb_opaque = pqbuf;
961 	}
962 
963 	LIST_INIT(&ccb->ccb_buf.b_dep);
964 	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
965 
966 	DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d "
967 	    "b_blkno: %x b_flags 0x%0x b_data %p\n",
968 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
969 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno,
970 	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);
971 
972 	wu->swu_io_count++;
973 
974 	return (0);
975 }
976 
977 /* Perform RAID6 parity calculation.
978  *   P=xor parity, Q=GF256 parity, D=data, gn=GF256 coefficient (gf_pow[disk]) */
979 void
980 sr_raid6_xorp(void *p, void *d, int len)
981 {
982 	uint32_t *pbuf = p, *data = d;
983 
984 	len >>= 2;
985 	while (len--)
986 		*pbuf++ ^= *data++;
987 }
988 
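/*
 * XOR the GF(2^8) product gn * D into the Q buffer, one 32-bit word
 * (four table lookups) at a time.  gf_premul(gn) must already have built
 * gf_map[gn]; len is assumed to be a multiple of four bytes.
 */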
989 void
990 sr_raid6_xorq(void *q, void *d, int len, int gn)
991 {
992 	uint32_t 	*qbuf = q, *data = d, x;
993 	uint8_t	 	*gn_map = gf_map[gn];
994 
995 	len >>= 2;
996 	while (len--) {
997 		x = *data++;
998 		*qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) |
999 		  	    ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) |
1000 			    ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) |
1001 			    ((uint32_t)gn_map[(x >> 24) & 0xff] << 24));
1002 	}
1003 }
1004 
1005 /* Create GF256 log/pow tables: polynomial = 0x11D */
1006 void
1007 gf_init(void)
1008 {
1009 	int i;
1010 	uint8_t p = 1;
1011 
1012 	/* use 2N pow table to avoid using % in multiply */
1013 	for (i=0; i<256; i++) {
1014 		gf_log[p] = i;
1015 		gf_pow[i] = gf_pow[i+255] = p;
1016 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
1017 	}
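	/*
	 * log(0) is undefined; point it past the filled part of gf_pow so
	 * that any product involving zero reads a zeroed table entry and
	 * therefore comes out as zero.
	 */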
1018 	gf_log[0] = 512;
1019 }
1020 
1021 uint8_t
1022 gf_inv(uint8_t a)
1023 {
1024 	return gf_pow[255 - gf_log[a]];
1025 }
1026 
1027 uint8_t
1028 gf_mul(uint8_t a, uint8_t b)
1029 {
1030 	return gf_pow[gf_log[a] + gf_log[b]];
1031 }
1032 
1033 /* Precalculate the multiply-by-gn lookup table used by sr_raid6_xorq() */
1034 int
1035 gf_premul(uint8_t gn)
1036 {
1037 	int i;
1038 
1039 	if (gf_map[gn] != NULL)
1040 		return (0);
1041 
1042 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL)
1043 		return (-1);
1044 
1045 	for (i=0; i<256; i++)
1046 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
1047 	return (0);
1048 }
1049