xref: /openbsd-src/sys/dev/softraid_raid6.c (revision 50027fe110c3c362514cbbf1128910104a00203e)
1 /* $OpenBSD: softraid_raid6.c,v 1.9 2009/12/07 14:33:38 jsing Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/proc.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/conf.h>
38 #include <sys/uio.h>
39 
40 #include <scsi/scsi_all.h>
41 #include <scsi/scsiconf.h>
42 #include <scsi/scsi_disk.h>
43 
44 #include <dev/softraidvar.h>
45 #include <dev/rndvar.h>
46 
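/*
 * GF(256) tables for the Q parity (generator polynomial 0x11D, built by
 * gf_init()): gf_pow[]/gf_log[] implement log-based multiplication and
 * gf_map[n] caches a 256-byte multiply-by-n table that gf_premul()
 * allocates on demand.
 */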
47 uint8_t *gf_map[256];
48 uint8_t	gf_pow[768];
49 int	gf_log[256];
50 
51 /* RAID 6 functions. */
52 int	sr_raid6_alloc_resources(struct sr_discipline *);
53 int	sr_raid6_free_resources(struct sr_discipline *);
54 int	sr_raid6_rw(struct sr_workunit *);
55 int	sr_raid6_openings(struct sr_discipline *);
56 void	sr_raid6_intr(struct buf *);
57 void	sr_raid6_recreate_wu(struct sr_workunit *);
58 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
59 void	sr_raid6_set_vol_state(struct sr_discipline *);
60 
61 void	sr_raid6_xorp(void *, void *, int);
62 void	sr_raid6_xorq(void *, void *, int, int);
63 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
64 	    void *, int, int, void *, void *, int);
65 void 	sr_dump(void *, int);
66 void	sr_raid6_scrub(struct sr_discipline *);
67 int	sr_failio(struct sr_workunit *);
68 
69 void	*sr_get_block(struct sr_discipline *, int);
70 void	sr_put_block(struct sr_discipline *, void *);
71 
72 void	gf_init(void);
73 uint8_t gf_inv(uint8_t);
74 int	gf_premul(uint8_t);
75 
76 #define SR_NOFAIL		0x00
77 #define SR_FAILX		(1L << 0)
78 #define SR_FAILY		(1L << 1)
79 #define SR_FAILP		(1L << 2)
80 #define SR_FAILQ		(1L << 3)
81 
82 #define M_FAIL 0x00
83 
84 #define M_RX   0x01
85 #define M_RXP  0x02
86 #define M_RXQ  0x03
87 #define M_RXY  0x04
88 #define M_RFLG 0x0F
89 
90 #define M_WXPQ 0x10
91 #define M_WXY  0x20
92 #define M_WPQ  0x30
93 #define M_WFLG 0xF0
94 
95 /* Mapping of Failure Flags to Read/Write state */
96 uint8_t sr_rwmode[16] = {
97 	[SR_FAILX+SR_FAILY+SR_FAILP] = M_FAIL,
98 	[SR_FAILX+SR_FAILY+SR_FAILQ] = M_FAIL,
99 	[SR_FAILX+SR_FAILP+SR_FAILQ] = M_FAIL,
100 	[SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL,
101 	[SR_FAILX+SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL,
102 
103 	[SR_NOFAIL]         = M_RX | M_WXPQ,
104 	[SR_FAILY]          = M_RX | M_WXPQ,
105 	[SR_FAILP]          = M_RX | M_WXPQ,
106 	[SR_FAILQ]          = M_RX | M_WXPQ,
107 	[SR_FAILY+SR_FAILP] = M_RX | M_WXPQ,
108 	[SR_FAILY+SR_FAILQ] = M_RX | M_WXPQ,
109 	[SR_FAILP+SR_FAILQ] = M_RX | M_WXPQ,
110 
111 	[SR_FAILX]          = M_RXQ | M_WPQ,
112 	[SR_FAILX+SR_FAILQ] = M_RXQ | M_WPQ,
113 	[SR_FAILX+SR_FAILP] = M_RXP | M_WPQ,
114 	[SR_FAILX+SR_FAILY] = M_RXY | M_WXY,
115 };
116 
117 struct sr_raid6_opaque {
118 	int      gn;
119 	void	*pbuf;
120 	void	*qbuf;
121 };
122 
123 /* discipline initialisation. */
124 void
125 sr_raid6_discipline_init(struct sr_discipline *sd)
126 {
127 
128 	/* Initialize GF256 tables */
129 	gf_init();
130 
131 	/* fill out discipline members. */
132 	sd->sd_type = SR_MD_RAID6;
133 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE;
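	/*
	 * A read-modify-write queues six CCBs per stripe (reads of the
	 * old data, P and Q plus writes of the new ones), and degraded
	 * operations may read every remaining chunk, so the CCB limit
	 * also scales with the chunk count.
	 */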
134 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */
135 	sd->sd_max_wu = SR_RAID6_NOWU;
136 
137 	/* setup discipline pointers. */
138 	sd->sd_alloc_resources = sr_raid6_alloc_resources;
139 	sd->sd_free_resources = sr_raid6_free_resources;
140 	sd->sd_start_discipline = NULL;
141 	sd->sd_scsi_inquiry = sr_raid_inquiry;
142 	sd->sd_scsi_read_cap = sr_raid_read_cap;
143 	sd->sd_scsi_tur = sr_raid_tur;
144 	sd->sd_scsi_req_sense = sr_raid_request_sense;
145 	sd->sd_scsi_start_stop = sr_raid_start_stop;
146 	sd->sd_scsi_sync = sr_raid_sync;
147 	sd->sd_scsi_rw = sr_raid6_rw;
148 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
149 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
150 	sd->sd_openings = sr_raid6_openings;
151 }
152 
153 int
154 sr_raid6_openings(struct sr_discipline *sd)
155 {
156 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
157 }
158 
159 int
160 sr_raid6_alloc_resources(struct sr_discipline *sd)
161 {
162 	int			rv = EINVAL;
163 
164 	if (!sd)
165 		return (rv);
166 
167 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
168 	    DEVNAME(sd->sd_sc));
169 
170 	if (sr_wu_alloc(sd))
171 		goto bad;
172 	if (sr_ccb_alloc(sd))
173 		goto bad;
174 
175 	/* setup runtime values */
176 	sd->mds.mdd_raid6.sr6_strip_bits =
177 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
178 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
179 		goto bad;
180 
181 	rv = 0;
182 bad:
183 	return (rv);
184 }
185 
186 int
187 sr_raid6_free_resources(struct sr_discipline *sd)
188 {
189 	int			rv = EINVAL;
190 
191 	if (!sd)
192 		return (rv);
193 
194 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
195 	    DEVNAME(sd->sd_sc));
196 
197 	sr_wu_free(sd);
198 	sr_ccb_free(sd);
199 
200 	rv = 0;
201 	return (rv);
202 }
203 
204 void
205 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
206 {
207 	int			old_state, s;
208 
209 	/* XXX this is for RAID 0 */
210 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid6_set_chunk_state %d -> %d\n",
211 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
212 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
213 
214 	/* ok to go to splbio since this only happens in error path */
215 	s = splbio();
216 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
217 
218 	/* multiple IOs to the same chunk that fail will come through here */
219 	if (old_state == new_state)
220 		goto done;
221 
222 	switch (old_state) {
223 	case BIOC_SDONLINE:
224 		switch (new_state) {
225 		case BIOC_SDOFFLINE:
226 		case BIOC_SDSCRUB:
227 			break;
228 		default:
229 			goto die;
230 		}
231 		break;
232 
233 	case BIOC_SDOFFLINE:
234 		if (new_state == BIOC_SDREBUILD) {
235 			;
236 		} else
237 			goto die;
238 		break;
239 
240 	case BIOC_SDSCRUB:
241 		switch (new_state) {
242 		case BIOC_SDONLINE:
243 		case BIOC_SDOFFLINE:
244 			break;
245 		default:
246 			goto die;
247 		}
248 		break;
249 
250 	case BIOC_SDREBUILD:
251 		switch (new_state) {
252 		case BIOC_SDONLINE:
253 		case BIOC_SDOFFLINE:
254 			break;
255 		default:
256 			goto die;
257 		}
258 		break;
259 
260 	default:
261 die:
262 		splx(s); /* XXX */
263 		panic("%s: %s: %s: invalid chunk state transition "
264 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
265 		    sd->sd_meta->ssd_devname,
266 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
267 		    old_state, new_state);
268 		/* NOTREACHED */
269 	}
270 
271 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
272 	sd->sd_set_vol_state(sd);
273 
274 	sd->sd_must_flush = 1;
275 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
276 done:
277 	splx(s);
278 }
279 
280 void
281 sr_raid6_set_vol_state(struct sr_discipline *sd)
282 {
283 	int			states[SR_MAX_STATES];
284 	int			new_state, i, s, nd;
285 	int			old_state = sd->sd_vol_status;
286 
287 	/* XXX this is for RAID 0 */
288 
289 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid6_set_vol_state\n",
290 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
291 
292 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
293 
294 	for (i = 0; i < SR_MAX_STATES; i++)
295 		states[i] = 0;
296 
297 	for (i = 0; i < nd; i++) {
298 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
299 		if (s >= SR_MAX_STATES)
300 			panic("%s: %s: %s: invalid chunk state",
301 			    DEVNAME(sd->sd_sc),
302 			    sd->sd_meta->ssd_devname,
303 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
304 		states[s]++;
305 	}
306 
307 	if (states[BIOC_SDONLINE] == nd)
308 		new_state = BIOC_SVONLINE;
309 	else if (states[BIOC_SDONLINE] < nd - 2)
310 		new_state = BIOC_SVOFFLINE;
311 	else if (states[BIOC_SDSCRUB] != 0)
312 		new_state = BIOC_SVSCRUB;
313 	else if (states[BIOC_SDREBUILD] != 0)
314 		new_state = BIOC_SVREBUILD;
315 	else if (states[BIOC_SDONLINE] < nd)
316 		new_state = BIOC_SVDEGRADED;
317 	else {
318 		printf("old_state = %d, ", old_state);
319 		for (i = 0; i < nd; i++)
320 			printf("%d = %d, ", i,
321 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
322 		panic("invalid new_state");
323 	}
324 
325 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid6_set_vol_state %d -> %d\n",
326 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
327 	    old_state, new_state);
328 
329 	switch (old_state) {
330 	case BIOC_SVONLINE:
331 		switch (new_state) {
332 		case BIOC_SVONLINE: /* can go to same state */
333 		case BIOC_SVOFFLINE:
334 		case BIOC_SVDEGRADED:
335 		case BIOC_SVREBUILD: /* happens on boot */
336 			break;
337 		default:
338 			goto die;
339 		}
340 		break;
341 
342 	case BIOC_SVOFFLINE:
343 		/* XXX this might be a little too much */
344 		goto die;
345 
346 	case BIOC_SVSCRUB:
347 		switch (new_state) {
348 		case BIOC_SVONLINE:
349 		case BIOC_SVOFFLINE:
350 		case BIOC_SVDEGRADED:
351 		case BIOC_SVSCRUB: /* can go to same state */
352 			break;
353 		default:
354 			goto die;
355 		}
356 		break;
357 
358 	case BIOC_SVBUILDING:
359 		switch (new_state) {
360 		case BIOC_SVONLINE:
361 		case BIOC_SVOFFLINE:
362 		case BIOC_SVBUILDING: /* can go to the same state */
363 			break;
364 		default:
365 			goto die;
366 		}
367 		break;
368 
369 	case BIOC_SVREBUILD:
370 		switch (new_state) {
371 		case BIOC_SVONLINE:
372 		case BIOC_SVOFFLINE:
373 		case BIOC_SVDEGRADED:
374 		case BIOC_SVREBUILD: /* can go to the same state */
375 			break;
376 		default:
377 			goto die;
378 		}
379 		break;
380 
381 	case BIOC_SVDEGRADED:
382 		switch (new_state) {
383 		case BIOC_SVOFFLINE:
384 		case BIOC_SVREBUILD:
385 		case BIOC_SVDEGRADED: /* can go to the same state */
386 			break;
387 		default:
388 			goto die;
389 		}
390 		break;
391 
392 	default:
393 die:
394 		panic("%s: %s: invalid volume state transition %d -> %d\n",
395 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
396 		    old_state, new_state);
397 		/* NOTREACHED */
398 	}
399 
400 	sd->sd_vol_status = new_state;
401 }
402 
403 /*  modes:
404  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
405  *	        SR_CCBF_FREEBUF, qbuf, NULL, 0);
406  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
407  *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
408  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
409  *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
410  */
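/*
 * In each mode the chunk is read into a scratch buffer and the interrupt
 * handler accumulates it into the buffers passed here: the pbuf argument
 * is XORed in unweighted (sr_raid6_xorp) and the qbuf argument is first
 * multiplied by gn in GF(256) (sr_raid6_xorq).  readq and readp thus XOR
 * the chunk straight into qbuf or pbuf, while readx also folds the
 * gf_pow[i]-weighted copy into qbuf.
 */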
411 
412 int
413 sr_raid6_rw(struct sr_workunit *wu)
414 {
415 	struct sr_workunit	*wu_w = NULL;
416 	struct sr_discipline	*sd = wu->swu_dis;
417 	struct scsi_xfer	*xs = wu->swu_xs;
418 	struct sr_chunk		*scp;
419 	int			s, fail, i, rwmode;
420 	daddr64_t		blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
421 	daddr64_t		strip_size, no_chunk, lba, chunk_offs, phys_offs;
422 	daddr64_t		strip_bits, length, strip_offs, datalen;
423 	void		        *pbuf, *data, *qbuf;
424 
425 	/* blk and scsi error will be handled by sr_validate_io */
426 	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
427 		goto bad;
428 
429 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
430 	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
431 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
432 
433 	data = xs->data;
434 	datalen = xs->datalen;
435 	lbaoffs	= blk << DEV_BSHIFT;
436 
437 	rwmode = (xs->flags & SCSI_DATA_IN) ? M_RFLG : M_WFLG;
438 	if (xs->flags & SCSI_DATA_OUT)
439 		/* create write workunit */
440 		if ((wu_w = sr_wu_get(sd, 0)) == NULL) {
441 			printf("%s: can't get wu_w\n", DEVNAME(sd->sd_sc));
442 			goto bad;
443 		}
444 
445 	wu->swu_blk_start = 0;
446 	while (datalen != 0) {
447 		strip_no = lbaoffs >> strip_bits;
448 		strip_offs = lbaoffs & (strip_size - 1);
449 		chunk_offs = (strip_no / no_chunk) << strip_bits;
450 		phys_offs = chunk_offs + strip_offs +
451 		    ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT);
452 
453 		/* get size remaining in this stripe */
454 		length = MIN(strip_size - strip_offs, datalen);
455 
456 		/* map disk offset to parity/data drive */
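		/*
		 * P and Q rotate across stripe rows: Q starts on the last
		 * chunk and moves down one chunk per row, with P on the
		 * chunk just before it (wrapping back to the last chunk
		 * once Q reaches chunk 0).  The data chunk index is then
		 * bumped past whichever chunks currently hold P and Q.
		 */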
457 		chunk = strip_no % no_chunk;
458 
459 		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
460 		if (qchunk == 0)
461 			pchunk = no_chunk + 1;
462 		else
463 			pchunk = qchunk - 1;
464 		if (chunk >= pchunk)
465 			chunk++;
466 		if (chunk >= qchunk)
467 			chunk++;
468 
469 		lba = phys_offs >> DEV_BSHIFT;
470 
471 		/* XXX big hammer.. exclude I/O from entire stripe */
472 		if (wu->swu_blk_start == 0)
473 			wu->swu_blk_start = chunk_offs >> DEV_BSHIFT;
474 		wu->swu_blk_end = ((chunk_offs + (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1;
475 
476 		fail = 0;
477 		fchunk = -1;
478 
479 		/* Get disk-fail flags */
480 		for (i=0; i< no_chunk+2; i++) {
481 			scp = sd->sd_vol.sv_chunks[i];
482 			switch (scp->src_meta.scm_status) {
483 			case BIOC_SDOFFLINE:
484 			case BIOC_SDREBUILD:
485 			case BIOC_SDHOTSPARE:
486 				if (i == qchunk)
487 					fail |= SR_FAILQ;
488 				else if (i == pchunk)
489 					fail |= SR_FAILP;
490 				else if (i == chunk)
491 					fail |= SR_FAILX;
492 				else {
493 					/* dual data-disk failure */
494 					fail |= SR_FAILY;
495 					fchunk = i;
496 				}
497 				break;
498 			}
499 		}
500 		if (xs->flags & SCSI_DATA_IN) {
501 			if (!(fail & SR_FAILX)) {
502 				/* drive is good. issue single read request */
503 				if (sr_raid6_addio(wu, chunk, lba, length,
504 				    data, xs->flags, 0, NULL, NULL, 0))
505 					goto bad;
506 			} else if (fail & SR_FAILP) {
507 				/* Dx, P failed */
508 				printf("Disk %llx offline, "
509 				    "regenerating Dx+P\n", chunk);
510 
511 				qbuf = sr_get_block(sd, length);
512 				if (qbuf == NULL)
513 					goto bad;
514 
515 				/* Calculate: Dx*gx = Q^(Dz*gz)
516 				 *   Q:  sr_raid6_xorp(data, --, length);
517 				 *   Dz: sr_raid6_xorq(data, --, length, gf_pow[i]);
518 				 */
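				/*
				 * qbuf accumulates Q XORed with every
				 * surviving Dz*gz, which leaves Dx*gx;
				 * the fake wu below then multiplies it
				 * by gf_inv(gf_pow[chunk]) into the
				 * data buffer, stripping the gx factor
				 * to recover Dx.
				 */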
519 				memset(data, 0, length);
520 				for (i = 0; i < no_chunk+2; i++) {
521 					if  (i == qchunk) {
522 						/* Read Q */
523 						if (sr_raid6_addio(wu, i, lba,
524 						    length, NULL, SCSI_DATA_IN,
525 						    SR_CCBF_FREEBUF, qbuf,
526 						    NULL, 0))
527 						    	goto bad;
528 					} else if (i != chunk && i != pchunk) {
529 						/* Read Dz * gz */
530 						if (sr_raid6_addio(wu, i, lba,
531 						   length, NULL, SCSI_DATA_IN,
532 						   SR_CCBF_FREEBUF, NULL,
533 						   qbuf, gf_pow[i]))
534 						   	goto bad;
535 					}
536 				}
537 
538 				/* run fake wu when read i/o is complete */
539 				if (wu_w == NULL &&
540 				    (wu_w = sr_wu_get(sd, 0)) == NULL)
541 					goto bad;
542 
543 				wu_w->swu_flags |= SR_WUF_FAIL;
544 				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
545 				    SR_CCBF_FREEBUF, NULL, data,
546 				    gf_inv(gf_pow[chunk])))
547 					goto bad;
548 			} else if (fail & SR_FAILY) {
549 				/* Dx, Dy failed */
550 				printf("Disk %llx & %llx offline, "
551 				    "regenerating Dx+Dy\n", chunk, fchunk);
552 				qbuf = sr_get_block(sd, length);
553 				if (qbuf == NULL)
554 					goto bad;
555 				pbuf = sr_get_block(sd, length);
556 				if (pbuf == NULL)
557 					goto bad;
558 
559 				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
560 				 *   Q:  sr_raid6_xorp(qbuf, --, length);
561 				 *   P:  sr_raid6_xorp(pbuf, --, length);
562 				 *   Dz: sr_raid6_xorp(pbuf, --, length);
563 				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
564 				 */
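				/*
				 * The reads below leave Dx^Dy (= P^Dz)
				 * in pbuf and Dx*gx ^ Dy*gy (= Q^Dz*gz)
				 * in qbuf.  Solving for Dx gives
				 * Dx = pbuf*gy/(gx^gy) ^ qbuf/(gx^gy);
				 * the two fake-wu coefficients below,
				 * gf_inv(gf_pow[255+chunk-fchunk] ^ 1)
				 * and gf_inv(gf_pow[chunk]^gf_pow[fchunk]),
				 * are exactly gy/(gx^gy) and 1/(gx^gy).
				 */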
565 				memset(data, 0, length);
566 				for (i = 0; i < no_chunk+2; i++) {
567 					if (i == qchunk) {
568 						/* read Q */
569 						if (sr_raid6_addio(wu, i, lba,
570 						    length,  NULL, SCSI_DATA_IN,
571 						    SR_CCBF_FREEBUF, qbuf,
572 						    NULL, 0))
573 						    	goto bad;
574 					} else if (i == pchunk) {
575 						/* read P */
576 						if (sr_raid6_addio(wu, i, lba,
577 						    length,  NULL, SCSI_DATA_IN,
578 						    SR_CCBF_FREEBUF, pbuf,
579 						    NULL, 0))
580 						    	goto bad;
581 					} else if (i != chunk) {
582 						/* read Dz * gz */
583 						if (sr_raid6_addio(wu, i, lba,
584 						    length, NULL, SCSI_DATA_IN,
585 						    SR_CCBF_FREEBUF, pbuf,
586 						    qbuf, gf_pow[i]))
587 						    	goto bad;
588 					}
589 				}
590 
591 				/* run fake wu when read i/o is complete */
592 				if (wu_w == NULL &&
593 				    (wu_w = sr_wu_get(sd, 0)) == NULL)
594 					goto bad;
595 
596 				wu_w->swu_flags |= SR_WUF_FAIL;
597 				if (sr_raid6_addio(wu_w, 0, 0, length, pbuf, 0,
598 				    SR_CCBF_FREEBUF, NULL, data,
599 				    gf_inv(gf_pow[255+chunk-fchunk] ^ 1)))
600 					goto bad;
601 				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
602 				    SR_CCBF_FREEBUF, NULL, data,
603 				    gf_inv(gf_pow[chunk] ^ gf_pow[fchunk])))
604 					goto bad;
605 			} else {
606 				/* Two cases: single disk (Dx) or (Dx+Q)
607 				 *   Dx = Dz ^ P (same as RAID5)
608 				 */
609 				printf("Disk %llx offline, "
610 				    "regenerating Dx%s\n", chunk,
611 				    fail & SR_FAILQ ? "+Q" : " single");
612 
613 				/* Calculate: Dx = P^Dz
614  				 *   P:  sr_raid6_xorp(data, ---, length);
615  				 *   Dz: sr_raid6_xorp(data, ---, length);
616 				 */
617 				memset(data, 0, length);
618 				for (i = 0; i < no_chunk+2; i++) {
619 					if (i != chunk && i != qchunk) {
620 						/* Read Dz */
621 						if (sr_raid6_addio(wu, i, lba,
622 						    length, NULL, SCSI_DATA_IN,
623 						    SR_CCBF_FREEBUF, data,
624 						    NULL, 0))
625 	 				    	    	goto bad;
626 					}
627 				}
628 
629 				/* data will contain correct value on completion */
630 			}
631 		} else {
632 			/* XXX handle writes to failed/offline disk? */
633 			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
634 				goto bad;
635 
636 			/*
637 			 * initialize pbuf with contents of new data to be
638 			 * written. This will be XORed with old data and old
639 			 * parity in the intr routine. The result in pbuf
640 			 * is the new parity data.
641 			 */
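			/*
			 * This is the usual read-modify-write update:
			 *   Pnew = Pold ^ Dold ^ Dnew
			 *   Qnew = Qold ^ gn*Dold ^ gn*Dnew
			 * pbuf and qbuf are seeded with Dnew and gn*Dnew
			 * here; the read of the old data XORs in Dold and
			 * gn*Dold, and the reads of the old parities XOR
			 * in Pold and Qold.
			 */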
642 			qbuf = sr_get_block(sd, length);
643 			if (qbuf == NULL)
644 				goto bad;
645 
646 			pbuf = sr_get_block(sd, length);
647 			if (pbuf == NULL)
648 				goto bad;
649 
650 			/* Calculate P = Dn; Q = gn * Dn */
651 			if (gf_premul(gf_pow[chunk]))
652 				goto bad;
653 			sr_raid6_xorp(pbuf, data, length);
654 			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);
655 
656 			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
657 			if (sr_raid6_addio(wu, chunk, lba, length, NULL,
658 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
659 				gf_pow[chunk]))
660 				goto bad;
661 
662 			/* Read old xor-parity: P ^= P' */
663 			if (sr_raid6_addio(wu, pchunk, lba, length, NULL,
664 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
665 				goto bad;
666 
667 			/* Read old q-parity: Q ^= Q' */
668 			if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
669 				SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
670 				goto bad;
671 
672 			/* write new data */
673 			if (sr_raid6_addio(wu_w, chunk, lba, length, data,
674 			    xs->flags, 0, NULL, NULL, 0))
675 				goto bad;
676 
677 			/* write new xor-parity */
678 			if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf,
679 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
680 				goto bad;
681 
682 			/* write new q-parity */
683 			if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf,
684 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
685 				goto bad;
686 		}
687 
688 		/* advance to next block */
689 		lbaoffs += length;
690 		datalen -= length;
691 		data += length;
692 	}
693 
694 	s = splbio();
695 	if (wu_w) {
696 		/* collide write request with reads */
697 		wu_w->swu_blk_start = wu->swu_blk_start;
698 		wu_w->swu_blk_end = wu->swu_blk_end;
699 
700 		/*
701 		 * put xs block in write request (scsi_done not called till
702 		 * write completes)
703 		 */
704 		wu_w->swu_xs = wu->swu_xs;
705 		wu->swu_xs = NULL;
706 
707 		wu_w->swu_state = SR_WU_DEFERRED;
708 		wu->swu_collider = wu_w;
709 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
710 	}
711 
712 	/* rebuild io, let rebuild routine deal with it */
713 	if (wu->swu_flags & SR_WUF_REBUILD)
714 		goto queued;
715 
716 	/* current io failed, restart */
717 	if (wu->swu_state == SR_WU_RESTART)
718 		goto start;
719 
720 	/* deferred io failed, don't restart */
721 	if (wu->swu_state == SR_WU_REQUEUE)
722 		goto queued;
723 
724 	if (sr_check_io_collision(wu))
725 		goto queued;
726 
727 start:
728 	sr_raid_startwu(wu);
729 queued:
730 	splx(s);
731 	return (0);
732 bad:
733 	/* wu is unwound by sr_wu_put */
734 	if (wu_w)
735 		sr_wu_put(wu_w);
736 	return (1);
737 }
738 
739 /* Handle failure I/O completion */
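/*
 * A wu flagged SR_WUF_FAIL is a 'fake' work unit: its CCBs only carry
 * buffers that earlier reads have already filled in, so instead of
 * issuing disk I/O each CCB is run straight through sr_raid6_intr(),
 * which applies the final XOR/GF-multiply step of the reconstruction.
 */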
740 int
741 sr_failio(struct sr_workunit *wu)
742 {
743 	struct sr_discipline	*sd = wu->swu_dis;
744 	struct sr_ccb		*ccb;
745 
746 	if (!(wu->swu_flags & SR_WUF_FAIL))
747 		return (0);
748 
749 	/* wu is a 'fake'; don't do real I/O, just call the interrupt handler */
750 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
751 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
752 		sr_raid6_intr(&ccb->ccb_buf);
753 	return (1);
754 }
755 
756 void
757 sr_raid6_intr(struct buf *bp)
758 {
759 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
760 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
761 	struct sr_discipline	*sd = wu->swu_dis;
762 	struct scsi_xfer	*xs = wu->swu_xs;
763 	struct sr_softc		*sc = sd->sd_sc;
764 	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
765 	int			s, pend;
766 
767 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
768 	    DEVNAME(sc), bp, xs);
769 
770 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
771 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
772 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
773 	    ccb->ccb_buf.b_blkno, ccb->ccb_target);
774 
775 	s = splbio();
776 
777 	if (ccb->ccb_buf.b_flags & B_ERROR) {
778 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
779 		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
780 		printf("io error: disk %x\n", ccb->ccb_target);
781 		wu->swu_ios_failed++;
782 		ccb->ccb_state = SR_CCB_FAILED;
783 		if (ccb->ccb_target != -1)
784 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
785 			    BIOC_SDOFFLINE);
786 		else
787 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
788 	} else {
789 		ccb->ccb_state = SR_CCB_OK;
790 		wu->swu_ios_succeeded++;
791 
792 		/* XOR data to result */
793 		if (pq) {
794 			if (pq->pbuf)
795 				/* Calculate xor-parity */
796 				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
797 				    ccb->ccb_buf.b_bcount);
798 			if (pq->qbuf)
799 				/* Calculate q-parity */
800 				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
801 				    ccb->ccb_buf.b_bcount, pq->gn);
802 			free(pq, M_DEVBUF);
803 			ccb->ccb_opaque = NULL;
804 		}
805 	}
806 
807 	/* free allocated data buffer */
808 	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
809 		sr_put_block(sd, ccb->ccb_buf.b_data);
810 		ccb->ccb_buf.b_data = NULL;
811 	}
812 	wu->swu_ios_complete++;
813 
814 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
815 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
816 	    wu->swu_ios_failed);
817 
818 	if (wu->swu_ios_complete >= wu->swu_io_count) {
819 
820 		/* if all ios failed, retry reads and give up on writes */
821 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
822 			if (xs->flags & SCSI_DATA_IN) {
823 				printf("%s: retrying read on block %lld\n",
824 				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
825 				sr_ccb_put(ccb);
826 				TAILQ_INIT(&wu->swu_ccb);
827 				wu->swu_state = SR_WU_RESTART;
828 				if (sd->sd_scsi_rw(wu))
829 					goto bad;
830 				else
831 					goto retry;
832 			} else {
833 				printf("%s: permanently fail write on block "
834 				    "%lld\n", DEVNAME(sc),
835 				    ccb->ccb_buf.b_blkno);
836 				xs->error = XS_DRIVER_STUFFUP;
837 				goto bad;
838 			}
839 		}
840 
841 		if (xs != NULL) {
842 			xs->error = XS_NOERROR;
843 			xs->resid = 0;
844 			xs->flags |= ITSDONE;
845 		}
846 
847 		pend = 0;
848 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
849 			if (wu == wup) {
850 				/* wu on pendq, remove */
851 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
852 				pend = 1;
853 
854 				if (wu->swu_collider) {
855 					if (wu->swu_ios_failed)
856 						/* toss all ccbs and recreate */
857 						sr_raid6_recreate_wu(wu->swu_collider);
858 
859 					/* restart deferred wu */
860 					wu->swu_collider->swu_state =
861 					    SR_WU_INPROGRESS;
862 					TAILQ_REMOVE(&sd->sd_wu_defq,
863 					    wu->swu_collider, swu_link);
864 					if (sr_failio(wu->swu_collider) == 0)
865 						sr_raid_startwu(wu->swu_collider);
866 				}
867 				break;
868 			}
869 		}
870 
871 		if (!pend)
872 			printf("%s: wu: %p not on pending queue\n",
873 			    DEVNAME(sc), wu);
874 
875 		if (wu->swu_flags & SR_WUF_REBUILD) {
876 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
877 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
878 				wakeup(wu);
879 			}
880 		} else {
881 			/* do not change the order of these 2 functions */
882 			sr_wu_put(wu);
883 			if (xs != NULL)
884 				scsi_done(xs);
885 		}
886 
887 		if (sd->sd_sync && sd->sd_wu_pending == 0)
888 			wakeup(sd);
889 	}
890 
891 retry:
892 	splx(s);
893 	return;
894 bad:
895 	xs->error = XS_DRIVER_STUFFUP;
896 	xs->flags |= ITSDONE;
897 	if (wu->swu_flags & SR_WUF_REBUILD) {
898 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
899 		wakeup(wu);
900 	} else {
901 		/* do not change the order of these 2 functions */
902 		sr_wu_put(wu);
903 		scsi_done(xs);
904 	}
905 
906 	splx(s);
907 }
908 
909 void
910 sr_raid6_recreate_wu(struct sr_workunit *wu)
911 {
912 	struct sr_discipline	*sd = wu->swu_dis;
913 	struct sr_workunit	*wup = wu;
914 	struct sr_ccb		*ccb;
915 
916 	do {
917 		DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", DEVNAME(sd->sd_sc), wup);
918 
919 		/* toss all ccbs */
920 		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
921 			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
922 			sr_ccb_put(ccb);
923 		}
924 		TAILQ_INIT(&wup->swu_ccb);
925 
926 		/* recreate ccbs */
927 		wup->swu_state = SR_WU_REQUEUE;
928 		if (sd->sd_scsi_rw(wup))
929 			panic("could not requeue io");
930 
931 		wup = wup->swu_collider;
932 	} while (wup);
933 }
934 
935 int
936 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len,
937     void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn)
938 {
939 	struct sr_discipline 	*sd = wu->swu_dis;
940 	struct sr_ccb		*ccb;
941 	struct sr_raid6_opaque  *pqbuf;
942 
943 	ccb = sr_ccb_get(sd);
944 	if (!ccb)
945 		return (-1);
946 
947 	/* allocate temporary buffer */
948 	if (data == NULL) {
949 		data = sr_get_block(sd, len);
950 		if (data == NULL)
951 			return (-1);
952 	}
953 
954 	DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n",
955 	    flag & SCSI_DATA_IN ? "read" : "write",
956 	    dsk, blk, len, pbuf, qbuf);
957 
958 	ccb->ccb_flag = ccbflag;
959 	if (flag & SCSI_POLL) {
960 		ccb->ccb_buf.b_flags = 0;
961 		ccb->ccb_buf.b_iodone = NULL;
962 	} else {
963 		ccb->ccb_buf.b_flags = B_CALL;
964 		ccb->ccb_buf.b_iodone = sr_raid6_intr;
965 	}
966 	if (flag & SCSI_DATA_IN)
967 		ccb->ccb_buf.b_flags |= B_READ;
968 	else
969 		ccb->ccb_buf.b_flags |= B_WRITE;
970 
971 	/* add offset for metadata */
972 	ccb->ccb_buf.b_flags |= B_PHYS;
973 	ccb->ccb_buf.b_blkno = blk;
974 	ccb->ccb_buf.b_bcount = len;
975 	ccb->ccb_buf.b_bufsize = len;
976 	ccb->ccb_buf.b_resid = len;
977 	ccb->ccb_buf.b_data = data;
978 	ccb->ccb_buf.b_error = 0;
979 	ccb->ccb_buf.b_proc = curproc;
980 	ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm;
981 	ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn;
982 	if ((ccb->ccb_buf.b_flags & B_READ) == 0)
983 		ccb->ccb_buf.b_vp->v_numoutput++;
984 
985 	ccb->ccb_wu = wu;
986 	ccb->ccb_target = dsk;
987 	if (pbuf || qbuf) {
988 		if (qbuf && gf_premul(gn))
989 			return (-1);
990 
991 		pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL);
992 		if (pqbuf == NULL) {
993 			sr_ccb_put(ccb);
994 			return (-1);
995 		}
996 		pqbuf->pbuf = pbuf;
997 		pqbuf->qbuf = qbuf;
998 		pqbuf->gn = gn;
999 		ccb->ccb_opaque = pqbuf;
1000 	}
1001 
1002 	LIST_INIT(&ccb->ccb_buf.b_dep);
1003 	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
1004 
1005 	DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d "
1006 	    "b_blkno: %x b_flags 0x%0x b_data %p\n",
1007 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
1008 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno,
1009 	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);
1010 
1011 	wu->swu_io_count++;
1012 
1013 	return (0);
1014 }
1015 
1016 /* Perform RAID6 parity calculation.
1017  *   P=xor parity, Q=GF256 parity, D=data, gn=GF256 coefficient (gf_pow[disk#]) */
1018 void
1019 sr_raid6_xorp(void *p, void *d, int len)
1020 {
1021 	uint8_t *pbuf = p, *data = d;
1022 
1023 	while (len--)
1024 		pbuf[len] ^= data[len];
1025 }
1026 
1027 void
1028 sr_raid6_xorq(void *q, void *d, int len, int gn)
1029 {
1030 	uint8_t		*qbuf = q, *data = d;
1031 	uint8_t		*gn_map = gf_map[gn];
1032 
1033 	/* Have to do this a byte at a time; the multiply is just a
1034 	 * table lookup since gn is constant for the whole buffer */
1035 	while (len--)
1036 		qbuf[len] ^= gn_map[data[len]];
1037 }
1038 
1039 /* Create GF256 log/pow tables: polynomial = 0x11D */
1040 void
1041 gf_init(void)
1042 {
1043 	int i;
1044 	uint8_t p = 1;
1045 
1046 	/* use 2N pow table to avoid using % in multiply */
1047 	for (i=0; i<256; i++) {
1048 		gf_log[p] = i;
1049 		gf_pow[i] = gf_pow[i+255] = p;
1050 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
1051 	}
1052 	gf_log[0] = 512;
1053 }
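
/*
 * With these tables a GF(256) multiply is simply
 *	a * b = gf_pow[gf_log[a] + gf_log[b]]
 * The pow table is written out twice (indices 0-510) so the sum of two
 * logs never needs a "% 255", and gf_log[0] = 512 points any multiply
 * by zero at the untouched (zero) tail of gf_pow[], so a zero operand
 * falls out correctly without a special case.  gf_premul() below bakes
 * this into a per-coefficient 256-byte lookup table.
 */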
1054 
1055 uint8_t
1056 gf_inv(uint8_t a)
1057 {
1058 	return (gf_pow[255 - gf_log[a]]);
1059 }
1060 
1061 /* Precalculate multiplication tables for drive gn */
1062 int
1063 gf_premul(uint8_t gn)
1064 {
1065 	int i;
1066 
1067 	if (gf_map[gn] != NULL)
1068 		return (0);
1069 
1070 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL)
1071 		return (-1);
1072 
1073 	for (i=0; i<256; i++)
1074 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
1075 	return (0);
1076 }
1077