xref: /openbsd-src/sys/dev/softraid_raid6.c (revision 5054e3e78af0749a9bb00ba9a024b3ee2d90290f)
1 /* $OpenBSD: softraid_raid6.c,v 1.7 2009/11/13 23:34:24 jordan Exp $ */
2 /*
3  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
4  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bio.h"
20 
21 #include <sys/param.h>
22 #include <sys/systm.h>
23 #include <sys/buf.h>
24 #include <sys/device.h>
25 #include <sys/ioctl.h>
26 #include <sys/proc.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/conf.h>
38 #include <sys/uio.h>
39 
40 #include <scsi/scsi_all.h>
41 #include <scsi/scsiconf.h>
42 #include <scsi/scsi_disk.h>
43 
44 #include <dev/softraidvar.h>
45 #include <dev/rndvar.h>
46 
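/*
 * GF(256) lookup tables (polynomial 0x11D):
 *   gf_map[n]  per-multiplier 256-byte product tables, built by gf_premul()
 *   gf_pow[]   doubled power table, so exponents never need "% 255"
 *   gf_log[]   log table; gf_log[0] is a sentinel (see gf_init())
 */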
47 uint8_t *gf_map[256];
48 uint8_t	gf_pow[768];
49 int	gf_log[256];
50 
51 /* RAID 6 functions. */
52 int	sr_raid6_alloc_resources(struct sr_discipline *);
53 int	sr_raid6_free_resources(struct sr_discipline *);
54 int	sr_raid6_rw(struct sr_workunit *);
55 int	sr_raid6_openings(struct sr_discipline *);
56 void	sr_raid6_intr(struct buf *);
57 void	sr_raid6_recreate_wu(struct sr_workunit *);
58 void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
59 void	sr_raid6_set_vol_state(struct sr_discipline *);
60 
61 void	sr_raid6_xorp(void *, void *, int);
62 void	sr_raid6_xorq(void *, void *, int, int);
63 int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
64 	    void *, int, int, void *, void *, int);
65 void 	sr_dump(void *, int);
66 void	sr_raid6_scrub(struct sr_discipline *);
67 int	sr_failio(struct sr_workunit *);
68 
69 void	*sr_get_block(struct sr_discipline *, int);
70 void	sr_put_block(struct sr_discipline *, void *);
71 
72 void	gf_init(void);
73 uint8_t gf_inv(uint8_t);
74 int	gf_premul(uint8_t);
75 
76 #define SR_NOFAIL		0x00
77 #define SR_FAILX		(1L << 0)
78 #define SR_FAILY		(1L << 1)
79 #define SR_FAILP		(1L << 2)
80 #define SR_FAILQ		(1L << 3)
81 
82 #define M_FAIL 0x00
83 
84 #define M_RX   0x01
85 #define M_RXP  0x02
86 #define M_RXQ  0x03
87 #define M_RXY  0x04
88 #define M_RFLG 0x0F
89 
90 #define M_WXPQ 0x10
91 #define M_WXY  0x20
92 #define M_WPQ  0x30
93 #define M_WFLG 0xF0
94 
95 /* Mapping of Failure Flags to Read/Write state */
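/*
 * Indexed by the OR of SR_FAIL{X,Y,P,Q}.  For example, with only the P
 * chunk offline (SR_FAILP), reads go straight to the data chunk (M_RX)
 * and writes still update data, P and Q (M_WXPQ); any triple failure
 * is unrecoverable (M_FAIL).
 */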
96 uint8_t sr_rwmode[16] = {
97 	[SR_FAILX+SR_FAILY+SR_FAILP] = M_FAIL,
98 	[SR_FAILX+SR_FAILY+SR_FAILQ] = M_FAIL,
99 	[SR_FAILX+SR_FAILP+SR_FAILQ] = M_FAIL,
100 	[SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL,
101 	[SR_FAILX+SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL,
102 
103 	[SR_NOFAIL]         = M_RX | M_WXPQ,
104 	[SR_FAILY]          = M_RX | M_WXPQ,
105 	[SR_FAILP]          = M_RX | M_WXPQ,
106 	[SR_FAILQ]          = M_RX | M_WXPQ,
107 	[SR_FAILY+SR_FAILP] = M_RX | M_WXPQ,
108 	[SR_FAILY+SR_FAILQ] = M_RX | M_WXPQ,
109 	[SR_FAILP+SR_FAILQ] = M_RX | M_WXPQ,
110 
111 	[SR_FAILX]          = M_RXQ | M_WPQ,
112 	[SR_FAILX+SR_FAILQ] = M_RXQ | M_WPQ,
113 	[SR_FAILX+SR_FAILP] = M_RXP | M_WPQ,
114 	[SR_FAILX+SR_FAILY] = M_RXY | M_WXY,
115 };
116 
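/*
 * Per-ccb accumulation context: on I/O completion sr_raid6_intr() XORs
 * the transferred data into pbuf (xor-parity) and/or XORs gn * data
 * into qbuf (q-parity); see sr_raid6_xorp() and sr_raid6_xorq().
 */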
117 struct sr_raid6_opaque {
118 	int      gn;
119 	void	*pbuf;
120 	void	*qbuf;
121 };
122 
123 /* discipline initialisation. */
124 void
125 sr_raid6_discipline_init(struct sr_discipline *sd)
126 {
127 	/* Initialize GF256 tables */
128 	gf_init();
129 
130 	/* fill out discipline members. */
131 	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */
132 	sd->sd_max_wu = SR_RAID6_NOWU;
133 	sd->sd_rebuild = 0;
134 
135 	/* setup discipline pointers. */
136 	sd->sd_alloc_resources = sr_raid6_alloc_resources;
137 	sd->sd_free_resources = sr_raid6_free_resources;
138 	sd->sd_start_discipline = NULL;
139 	sd->sd_scsi_inquiry = sr_raid_inquiry;
140 	sd->sd_scsi_read_cap = sr_raid_read_cap;
141 	sd->sd_scsi_tur = sr_raid_tur;
142 	sd->sd_scsi_req_sense = sr_raid_request_sense;
143 	sd->sd_scsi_start_stop = sr_raid_start_stop;
144 	sd->sd_scsi_sync = sr_raid_sync;
145 	sd->sd_scsi_rw = sr_raid6_rw;
146 	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
147 	sd->sd_set_vol_state = sr_raid6_set_vol_state;
148 	sd->sd_openings = sr_raid6_openings;
149 }
150 
151 int
152 sr_raid6_openings(struct sr_discipline *sd)
153 {
154 	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
155 }
156 
157 int
158 sr_raid6_alloc_resources(struct sr_discipline *sd)
159 {
160 	int			rv = EINVAL;
161 
162 	if (!sd)
163 		return (rv);
164 
165 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
166 	    DEVNAME(sd->sd_sc));
167 
168 	if (sr_wu_alloc(sd))
169 		goto bad;
170 	if (sr_ccb_alloc(sd))
171 		goto bad;
172 
173 	/* setup runtime values */
174 	sd->mds.mdd_raid6.sr6_strip_bits =
175 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
176 	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
177 		goto bad;
178 
179 	rv = 0;
180 bad:
181 	return (rv);
182 }
183 
184 int
185 sr_raid6_free_resources(struct sr_discipline *sd)
186 {
187 	int			rv = EINVAL;
188 
189 	if (!sd)
190 		return (rv);
191 
192 	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
193 	    DEVNAME(sd->sd_sc));
194 
195 	sr_wu_free(sd);
196 	sr_ccb_free(sd);
197 
198 	rv = 0;
199 	return (rv);
200 }
201 
202 void
203 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
204 {
205 	int			old_state, s;
206 
207 	/* XXX this is for RAID 0 */
208 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
209 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
210 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
211 
212 	/* ok to go to splbio since this only happens in error path */
213 	s = splbio();
214 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
215 
216 	/* multiple IOs to the same chunk that fail will come through here */
217 	if (old_state == new_state)
218 		goto done;
219 
220 	switch (old_state) {
221 	case BIOC_SDONLINE:
222 		switch (new_state) {
223 		case BIOC_SDOFFLINE:
224 		case BIOC_SDSCRUB:
225 			break;
226 		default:
227 			goto die;
228 		}
229 		break;
230 
231 	case BIOC_SDOFFLINE:
232 		if (new_state == BIOC_SDREBUILD) {
233 			;
234 		} else
235 			goto die;
236 		break;
237 
238 	case BIOC_SDSCRUB:
239 		switch (new_state) {
240 		case BIOC_SDONLINE:
241 		case BIOC_SDOFFLINE:
242 			break;
243 		default:
244 			goto die;
245 		}
246 		break;
247 
248 	case BIOC_SDREBUILD:
249 		switch (new_state) {
250 		case BIOC_SDONLINE:
251 		case BIOC_SDOFFLINE:
252 			break;
253 		default:
254 			goto die;
255 		}
256 		break;
257 
258 	default:
259 die:
260 		splx(s); /* XXX */
261 		panic("%s: %s: %s: invalid chunk state transition "
262 		    "%d -> %d\n", DEVNAME(sd->sd_sc),
263 		    sd->sd_meta->ssd_devname,
264 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
265 		    old_state, new_state);
266 		/* NOTREACHED */
267 	}
268 
269 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
270 	sd->sd_set_vol_state(sd);
271 
272 	sd->sd_must_flush = 1;
273 	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
274 done:
275 	splx(s);
276 }
277 
278 void
279 sr_raid6_set_vol_state(struct sr_discipline *sd)
280 {
281 	int			states[SR_MAX_STATES];
282 	int			new_state, i, s, nd;
283 	int			old_state = sd->sd_vol_status;
284 
285 	/* XXX this is for RAID 0 */
286 
287 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
288 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
289 
290 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
291 
292 	for (i = 0; i < SR_MAX_STATES; i++)
293 		states[i] = 0;
294 
295 	for (i = 0; i < nd; i++) {
296 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
297 		if (s >= SR_MAX_STATES)
298 			panic("%s: %s: %s: invalid chunk state",
299 			    DEVNAME(sd->sd_sc),
300 			    sd->sd_meta->ssd_devname,
301 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
302 		states[s]++;
303 	}
304 
305 	if (states[BIOC_SDONLINE] == nd)
306 		new_state = BIOC_SVONLINE;
307 	else if (states[BIOC_SDONLINE] < nd - 2)
308 		new_state = BIOC_SVOFFLINE;
309 	else if (states[BIOC_SDSCRUB] != 0)
310 		new_state = BIOC_SVSCRUB;
311 	else if (states[BIOC_SDREBUILD] != 0)
312 		new_state = BIOC_SVREBUILD;
313 	else if (states[BIOC_SDONLINE] < nd)
314 		new_state = BIOC_SVDEGRADED;
315 	else {
316 		printf("old_state = %d, ", old_state);
317 		for (i = 0; i < nd; i++)
318 			printf("%d = %d, ", i,
319 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
320 		panic("invalid new_state");
321 	}
322 
323 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
324 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
325 	    old_state, new_state);
326 
327 	switch (old_state) {
328 	case BIOC_SVONLINE:
329 		switch (new_state) {
330 		case BIOC_SVONLINE: /* can go to same state */
331 		case BIOC_SVOFFLINE:
332 		case BIOC_SVDEGRADED:
333 		case BIOC_SVREBUILD: /* happens on boot */
334 			break;
335 		default:
336 			goto die;
337 		}
338 		break;
339 
340 	case BIOC_SVOFFLINE:
341 		/* XXX this might be a little too much */
342 		goto die;
343 
344 	case BIOC_SVSCRUB:
345 		switch (new_state) {
346 		case BIOC_SVONLINE:
347 		case BIOC_SVOFFLINE:
348 		case BIOC_SVDEGRADED:
349 		case BIOC_SVSCRUB: /* can go to same state */
350 			break;
351 		default:
352 			goto die;
353 		}
354 		break;
355 
356 	case BIOC_SVBUILDING:
357 		switch (new_state) {
358 		case BIOC_SVONLINE:
359 		case BIOC_SVOFFLINE:
360 		case BIOC_SVBUILDING: /* can go to the same state */
361 			break;
362 		default:
363 			goto die;
364 		}
365 		break;
366 
367 	case BIOC_SVREBUILD:
368 		switch (new_state) {
369 		case BIOC_SVONLINE:
370 		case BIOC_SVOFFLINE:
371 		case BIOC_SVDEGRADED:
372 		case BIOC_SVREBUILD: /* can go to the same state */
373 			break;
374 		default:
375 			goto die;
376 		}
377 		break;
378 
379 	case BIOC_SVDEGRADED:
380 		switch (new_state) {
381 		case BIOC_SVOFFLINE:
382 		case BIOC_SVREBUILD:
383 		case BIOC_SVDEGRADED: /* can go to the same state */
384 			break;
385 		default:
386 			goto die;
387 		}
388 		break;
389 
390 	default:
391 die:
392 		panic("%s: %s: invalid volume state transition %d -> %d\n",
393 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
394 		    old_state, new_state);
395 		/* NOTREACHED */
396 	}
397 
398 	sd->sd_vol_status = new_state;
399 }
400 
401 /*  Read-accumulate modes (wu argument omitted for brevity):
402  *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
403  *	        SR_CCBF_FREEBUF, qbuf, NULL, 0);
404  *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
405  *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
406  *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
407  *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
408  */
409 
410 int
411 sr_raid6_rw(struct sr_workunit *wu)
412 {
413 	struct sr_workunit	*wu_w = NULL;
414 	struct sr_discipline	*sd = wu->swu_dis;
415 	struct scsi_xfer	*xs = wu->swu_xs;
416 	struct sr_chunk		*scp;
417 	int			s, fail, i, rwmode;
418 	daddr64_t		blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
419 	daddr64_t		strip_size, no_chunk, lba, chunk_offs, phys_offs;
420 	daddr64_t		strip_bits, length, strip_offs, datalen;
421 	void		        *pbuf, *data, *qbuf;
422 
423 	/* blk and scsi error will be handled by sr_validate_io */
424 	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
425 		goto bad;
426 
427 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
428 	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
429 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;
430 
431 	data = xs->data;
432 	datalen = xs->datalen;
433 	lbaoffs	= blk << DEV_BSHIFT;
434 
435 	rwmode = (xs->flags & SCSI_DATA_IN) ? M_RFLG : M_WFLG;
436 	if (xs->flags & SCSI_DATA_OUT)
437 		/* create write workunit */
438 		if ((wu_w = sr_wu_get(sd, 0)) == NULL) {
439 			printf("%s: can't get wu_w\n", DEVNAME(sd->sd_sc));
440 			goto bad;
441 		}
442 
443 	wu->swu_blk_start = 0;
444 	while (datalen != 0) {
445 		strip_no = lbaoffs >> strip_bits;
446 		strip_offs = lbaoffs & (strip_size - 1);
447 		chunk_offs = (strip_no / no_chunk) << strip_bits;
448 		phys_offs = chunk_offs + strip_offs +
449 		    ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT);
450 
451 		/* get size remaining in this stripe */
452 		length = MIN(strip_size - strip_offs, datalen);
453 
454 		/* map disk offset to parity/data drive */
455 		chunk = strip_no % no_chunk;
456 
457 		qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2));
458 		if (qchunk == 0)
459 			pchunk = no_chunk + 1;
460 		else
461 			pchunk = qchunk - 1;
462 		if (chunk >= pchunk)
463 			chunk++;
464 		if (chunk >= qchunk)
465 			chunk++;
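		/*
		 * Illustrative layout with four chunks (no_chunk == 2);
		 * parity rotates left one chunk per row of stripes:
		 *   row 0: D0 D1 P  Q     row 2: P  Q  D0 D1
		 *   row 1: D0 P  Q  D1    row 3: Q  D0 D1 P
		 */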
466 
467 		lba = phys_offs >> DEV_BSHIFT;
468 
469 		/* XXX big hammer.. exclude I/O from entire stripe */
470 		if (wu->swu_blk_start == 0)
471 			wu->swu_blk_start = chunk_offs >> DEV_BSHIFT;
472 		wu->swu_blk_end = ((chunk_offs + (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1;
473 
474 		fail = 0;
475 		fchunk = -1;
476 
477 		/* Get disk-fail flags */
478 		for (i=0; i< no_chunk+2; i++) {
479 			scp = sd->sd_vol.sv_chunks[i];
480 			switch (scp->src_meta.scm_status) {
481 			case BIOC_SDOFFLINE:
482 			case BIOC_SDREBUILD:
483 			case BIOC_SDHOTSPARE:
484 				if (i == qchunk)
485 					fail |= SR_FAILQ;
486 				else if (i == pchunk)
487 					fail |= SR_FAILP;
488 				else if (i == chunk)
489 					fail |= SR_FAILX;
490 				else {
491 					/* dual data-disk failure */
492 					fail |= SR_FAILY;
493 					fchunk = i;
494 				}
495 				break;
496 			}
497 		}
498 		if (xs->flags & SCSI_DATA_IN) {
499 			if (!(fail & SR_FAILX)) {
500 				/* drive is good. issue single read request */
501 				if (sr_raid6_addio(wu, chunk, lba, length,
502 				    data, xs->flags, 0, NULL, NULL, 0))
503 					goto bad;
504 			} else if (fail & SR_FAILP) {
505 				/* Dx, P failed */
506 				printf("Disk %llx offline, "
507 				    "regenerating Dx+P\n", chunk);
508 
509 				qbuf = sr_get_block(sd, length);
510 				if (qbuf == NULL)
511 					goto bad;
512 
513 				/* Calculate: Dx*gx = Q^(Dz*gz)
514 				 *   Q:  sr_raid6_xorp(qbuf, --, length);
515 				 *   Dz: sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
516 				 */
517 				memset(data, 0, length);
518 				for (i = 0; i < no_chunk+2; i++) {
519 					if  (i == qchunk) {
520 						/* Read Q */
521 						if (sr_raid6_addio(wu, i, lba,
522 						    length, NULL, SCSI_DATA_IN,
523 						    SR_CCBF_FREEBUF, qbuf,
524 						    NULL, 0))
525 						    	goto bad;
526 					} else if (i != chunk && i != pchunk) {
527 						/* Read Dz * gz */
528 						if (sr_raid6_addio(wu, i, lba,
529 						   length, NULL, SCSI_DATA_IN,
530 						   SR_CCBF_FREEBUF, NULL,
531 						   qbuf, gf_pow[i]))
532 						   	goto bad;
533 					}
534 				}
535 
536 				/* run fake wu when read i/o is complete */
537 				if (wu_w == NULL &&
538 				    (wu_w = sr_wu_get(sd, 0)) == NULL)
539 					goto bad;
540 
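				/*
				 * When the reads complete, qbuf == Q ^ (xor of
				 * Dz*gz) == Dx*gx; the fake wu below multiplies
				 * it by gf_inv(gx) to recover Dx into data.
				 */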
541 				wu_w->swu_flags |= SR_WUF_FAIL;
542 				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
543 				    SR_CCBF_FREEBUF, NULL, data,
544 				    gf_inv(gf_pow[chunk])))
545 					goto bad;
546 			} else if (fail & SR_FAILY) {
547 				/* Dx, Dy failed */
548 				printf("Disk %llx & %llx offline, "
549 				    "regenerating Dx+Dy\n", chunk, fchunk);
550 				qbuf = sr_get_block(sd, length);
551 				if (qbuf == NULL)
552 					goto bad;
553 				pbuf = sr_get_block(sd, length);
554 				if (pbuf == NULL)
555 					goto bad;
556 
557 				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz
558 				 *   Q:  sr_raid6_xorp(qbuf, --, length);
559 				 *   P:  sr_raid6_xorp(pbuf, --, length);
560 				 *   Dz: sr_raid6_xorp(pbuf, --, length);
561 				 *	 sr_raid6_xorq(qbuf, --, length, gf_pow[i]);
562 				 */
563 				memset(data, 0, length);
564 				for (i = 0; i < no_chunk+2; i++) {
565 					if (i == qchunk) {
566 						/* read Q */
567 						if (sr_raid6_addio(wu, i, lba,
568 						    length,  NULL, SCSI_DATA_IN,
569 						    SR_CCBF_FREEBUF, qbuf,
570 						    NULL, 0))
571 						    	goto bad;
572 					} else if (i == pchunk) {
573 						/* read P */
574 						if (sr_raid6_addio(wu, i, lba,
575 						    length,  NULL, SCSI_DATA_IN,
576 						    SR_CCBF_FREEBUF, pbuf,
577 						    NULL, 0))
578 						    	goto bad;
579 					} else if (i != chunk) {
580 						/* read Dz * gz */
581 						if (sr_raid6_addio(wu, i, lba,
582 						    length, NULL, SCSI_DATA_IN,
583 						    SR_CCBF_FREEBUF, pbuf,
584 						    qbuf, gf_pow[i]))
585 						    	goto bad;
586 					}
587 				}
588 
589 				/* run fake wu when read i/o is complete */
590 				if (wu_w == NULL &&
591 				    (wu_w = sr_wu_get(sd, 0)) == NULL)
592 					goto bad;
593 
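				/*
				 * When the reads complete, pbuf == Dx ^ Dy and
				 * qbuf == Dx*gx ^ Dy*gy.  The two fake-wu
				 * entries queued below fold these into data:
				 *   Dx = pbuf/(g^(x-y) ^ 1) ^ qbuf/(gx ^ gy)
				 * with x == chunk and y == fchunk.
				 */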
594 				wu_w->swu_flags |= SR_WUF_FAIL;
595 				if (sr_raid6_addio(wu_w, 0, 0, length, pbuf, 0,
596 				    SR_CCBF_FREEBUF, NULL, data,
597 				    gf_inv(gf_pow[255+chunk-fchunk] ^ 1)))
598 					goto bad;
599 				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
600 				    SR_CCBF_FREEBUF, NULL, data,
601 				    gf_inv(gf_pow[chunk] ^ gf_pow[fchunk])))
602 					goto bad;
603 			} else {
604 				/* Two cases: single disk (Dx) or (Dx+Q)
605 				 *   Dx = P ^ (xor of all other Dz) (same as RAID 5)
606 				 */
607 				printf("Disk %llx offline, "
608 				    "regenerating Dx%s\n", chunk,
609 				    fail & SR_FAILQ ? "+Q" : " single");
610 
611 				/* Calculate: Dx = P^Dz
612  				 *   P:  sr_raid6_xorp(data, ---, length);
613  				 *   Dz: sr_raid6_xorp(data, ---, length);
614 				 */
615 				memset(data, 0, length);
616 				for (i = 0; i < no_chunk+2; i++) {
617 					if (i != chunk && i != qchunk) {
618 						/* Read Dz */
619 						if (sr_raid6_addio(wu, i, lba,
620 						    length, NULL, SCSI_DATA_IN,
621 						    SR_CCBF_FREEBUF, data,
622 						    NULL, 0))
623 	 				    	    	goto bad;
624 					}
625 				}
626 
627 				/* data will contain correct value on completion */
628 			}
629 		} else {
630 			/* XXX handle writes to failed/offline disk? */
631 			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
632 				goto bad;
633 
634 			/*
635 			 * Seed pbuf with the new data and qbuf with gn * the
636 			 * new data.  The intr routine XORs in the old data and
637 			 * old parities, leaving the new xor-parity in pbuf and
638 			 * the new q-parity in qbuf.
639 			 */
640 			qbuf = sr_get_block(sd, length);
641 			if (qbuf == NULL)
642 				goto bad;
643 
644 			pbuf = sr_get_block(sd, length);
645 			if (pbuf == NULL)
646 				goto bad;
647 
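			/*
			 * Read-modify-write update:
			 *   P_new = P_old ^ Dn_old ^ Dn_new
			 *   Q_new = Q_old ^ gn*Dn_old ^ gn*Dn_new
			 */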
648 			/* Calculate P = Dn; Q = gn * Dn */
649 			if (gf_premul(gf_pow[chunk]))
650 				goto bad;
651 			sr_raid6_xorp(pbuf, data, length);
652 			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);
653 
654 			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
655 			if (sr_raid6_addio(wu, chunk, lba, length, NULL,
656 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
657 				gf_pow[chunk]))
658 				goto bad;
659 
660 			/* Read old xor-parity: P ^= P' */
661 			if (sr_raid6_addio(wu, pchunk, lba, length, NULL,
662 				SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
663 				goto bad;
664 
665 			/* Read old q-parity: Q ^= Q' */
666 			if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
667 				SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
668 				goto bad;
669 
670 			/* write new data */
671 			if (sr_raid6_addio(wu_w, chunk, lba, length, data,
672 			    xs->flags, 0, NULL, NULL, 0))
673 				goto bad;
674 
675 			/* write new xor-parity */
676 			if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf,
677 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
678 				goto bad;
679 
680 			/* write new q-parity */
681 			if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf,
682 			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
683 				goto bad;
684 		}
685 
686 		/* advance to next block */
687 		lbaoffs += length;
688 		datalen -= length;
689 		data += length;
690 	}
691 
692 	s = splbio();
693 	if (wu_w) {
694 		/* collide write request with reads */
695 		wu_w->swu_blk_start = wu->swu_blk_start;
696 		wu_w->swu_blk_end = wu->swu_blk_end;
697 
698 		/*
699 		 * put xs block in write request (scsi_done not called till
700 		 * write completes)
701 		 */
702 		wu_w->swu_xs = wu->swu_xs;
703 		wu->swu_xs = NULL;
704 
705 		wu_w->swu_state = SR_WU_DEFERRED;
706 		wu->swu_collider = wu_w;
707 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
708 	}
709 
710 	/* rebuild io, let rebuild routine deal with it */
711 	if (wu->swu_flags & SR_WUF_REBUILD)
712 		goto queued;
713 
714 	/* current io failed, restart */
715 	if (wu->swu_state == SR_WU_RESTART)
716 		goto start;
717 
718 	/* deferred io failed, don't restart */
719 	if (wu->swu_state == SR_WU_REQUEUE)
720 		goto queued;
721 
722 	if (sr_check_io_collision(wu))
723 		goto queued;
724 
725 start:
726 	sr_raid_startwu(wu);
727 queued:
728 	splx(s);
729 	return (0);
730 bad:
731 	/* wu is unwound by sr_wu_put */
732 	if (wu_w)
733 		sr_wu_put(wu_w);
734 	return (1);
735 }
736 
737 /* Handle failure I/O completion */
738 int
739 sr_failio(struct sr_workunit *wu)
740 {
741 	struct sr_discipline	*sd = wu->swu_dis;
742 	struct sr_ccb		*ccb;
743 
744 	if (!(wu->swu_flags & SR_WUF_FAIL))
745 		return (0);
746 
747 	/* The wu is a fake: do no real I/O, just run the intr handlers */
748 	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
749 	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
750 		sr_raid6_intr(&ccb->ccb_buf);
751 	return (1);
752 }
753 
754 void
755 sr_raid6_intr(struct buf *bp)
756 {
757 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
758 	struct sr_workunit	*wu = ccb->ccb_wu, *wup;
759 	struct sr_discipline	*sd = wu->swu_dis;
760 	struct scsi_xfer	*xs = wu->swu_xs;
761 	struct sr_softc		*sc = sd->sd_sc;
762 	struct sr_raid6_opaque  *pq = ccb->ccb_opaque;
763 	int			s, pend;
764 
765 	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
766 	    DEVNAME(sc), bp, xs);
767 
768 	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
769 	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
770 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
771 	    ccb->ccb_buf.b_blkno, ccb->ccb_target);
772 
773 	s = splbio();
774 
775 	if (ccb->ccb_buf.b_flags & B_ERROR) {
776 		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
777 		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
778 		printf("io error: disk %x\n", ccb->ccb_target);
779 		wu->swu_ios_failed++;
780 		ccb->ccb_state = SR_CCB_FAILED;
781 		if (ccb->ccb_target != -1)
782 			sd->sd_set_chunk_state(sd, ccb->ccb_target,
783 			    BIOC_SDOFFLINE);
784 		else
785 			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
786 	} else {
787 		ccb->ccb_state = SR_CCB_OK;
788 		wu->swu_ios_succeeded++;
789 
790 		/* XOR data to result */
791 		if (pq) {
792 			if (pq->pbuf)
793 				/* Calculate xor-parity */
794 				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
795 				    ccb->ccb_buf.b_bcount);
796 			if (pq->qbuf)
797 				/* Calculate q-parity */
798 				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
799 				    ccb->ccb_buf.b_bcount, pq->gn);
800 			free(pq, M_DEVBUF);
801 			ccb->ccb_opaque = NULL;
802 		}
803 	}
804 
805 	/* free allocated data buffer */
806 	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
807 		sr_put_block(sd, ccb->ccb_buf.b_data);
808 		ccb->ccb_buf.b_data = NULL;
809 	}
810 	wu->swu_ios_complete++;
811 
812 	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
813 	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
814 	    wu->swu_ios_failed);
815 
816 	if (wu->swu_ios_complete >= wu->swu_io_count) {
817 
818 		/* if all ios failed, retry reads and give up on writes */
819 		if (wu->swu_ios_failed == wu->swu_ios_complete) {
820 			if (xs->flags & SCSI_DATA_IN) {
821 				printf("%s: retrying read on block %lld\n",
822 				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
823 				sr_ccb_put(ccb);
824 				TAILQ_INIT(&wu->swu_ccb);
825 				wu->swu_state = SR_WU_RESTART;
826 				if (sd->sd_scsi_rw(wu))
827 					goto bad;
828 				else
829 					goto retry;
830 			} else {
831 				printf("%s: permanently fail write on block "
832 				    "%lld\n", DEVNAME(sc),
833 				    ccb->ccb_buf.b_blkno);
834 				xs->error = XS_DRIVER_STUFFUP;
835 				goto bad;
836 			}
837 		}
838 
839 		if (xs != NULL) {
840 			xs->error = XS_NOERROR;
841 			xs->resid = 0;
842 			xs->flags |= ITSDONE;
843 		}
844 
845 		pend = 0;
846 		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
847 			if (wu == wup) {
848 				/* wu on pendq, remove */
849 				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
850 				pend = 1;
851 
852 				if (wu->swu_collider) {
853 					if (wu->swu_ios_failed)
854 						/* toss all ccbs and recreate */
855 						sr_raid6_recreate_wu(wu->swu_collider);
856 
857 					/* restart deferred wu */
858 					wu->swu_collider->swu_state =
859 					    SR_WU_INPROGRESS;
860 					TAILQ_REMOVE(&sd->sd_wu_defq,
861 					    wu->swu_collider, swu_link);
862 					if (sr_failio(wu->swu_collider) == 0)
863 						sr_raid_startwu(wu->swu_collider);
864 				}
865 				break;
866 			}
867 		}
868 
869 		if (!pend)
870 			printf("%s: wu: %p not on pending queue\n",
871 			    DEVNAME(sc), wu);
872 
873 		if (wu->swu_flags & SR_WUF_REBUILD) {
874 			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
875 				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
876 				wakeup(wu);
877 			}
878 		} else {
879 			/* do not change the order of these 2 functions */
880 			sr_wu_put(wu);
881 			if (xs != NULL)
882 				scsi_done(xs);
883 		}
884 
885 		if (sd->sd_sync && sd->sd_wu_pending == 0)
886 			wakeup(sd);
887 	}
888 
889 retry:
890 	splx(s);
891 	return;
892 bad:
893 	xs->error = XS_DRIVER_STUFFUP;
894 	xs->flags |= ITSDONE;
895 	if (wu->swu_flags & SR_WUF_REBUILD) {
896 		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
897 		wakeup(wu);
898 	} else {
899 		/* do not change the order of these 2 functions */
900 		sr_wu_put(wu);
901 		scsi_done(xs);
902 	}
903 
904 	splx(s);
905 }
906 
907 void
908 sr_raid6_recreate_wu(struct sr_workunit *wu)
909 {
910 	struct sr_discipline	*sd = wu->swu_dis;
911 	struct sr_workunit	*wup = wu;
912 	struct sr_ccb		*ccb;
913 
914 	do {
915 		DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", DEVNAME(sd->sd_sc), wup);
916 
917 		/* toss all ccbs */
918 		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
919 			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
920 			sr_ccb_put(ccb);
921 		}
922 		TAILQ_INIT(&wup->swu_ccb);
923 
924 		/* recreate ccbs */
925 		wup->swu_state = SR_WU_REQUEUE;
926 		if (sd->sd_scsi_rw(wup))
927 			panic("could not requeue io");
928 
929 		wup = wup->swu_collider;
930 	} while (wup);
931 }
932 
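/*
 * Queue a single chunk I/O on wu.  If pbuf/qbuf are given, the completion
 * handler folds the transferred data into them: pbuf ^= data and
 * qbuf ^= gn * data, where gn is a gf_pow[] coefficient.
 */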
933 int
934 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len,
935     void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn)
936 {
937 	struct sr_discipline 	*sd = wu->swu_dis;
938 	struct sr_ccb		*ccb;
939 	struct sr_raid6_opaque  *pqbuf;
940 
941 	ccb = sr_ccb_get(sd);
942 	if (!ccb)
943 		return (-1);
944 
945 	/* allocate temporary buffer */
946 	if (data == NULL) {
947 		data = sr_get_block(sd, len);
948 		if (data == NULL) {
949 			sr_ccb_put(ccb);
			return (-1);
		}
950 	}
951 
952 	DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n",
953 	    flag & SCSI_DATA_IN ? "read" : "write",
954 	    dsk, blk, len, pbuf, qbuf);
955 
956 	ccb->ccb_flag = ccbflag;
957 	if (flag & SCSI_POLL) {
958 		ccb->ccb_buf.b_flags = 0;
959 		ccb->ccb_buf.b_iodone = NULL;
960 	} else {
961 		ccb->ccb_buf.b_flags = B_CALL;
962 		ccb->ccb_buf.b_iodone = sr_raid6_intr;
963 	}
964 	if (flag & SCSI_DATA_IN)
965 		ccb->ccb_buf.b_flags |= B_READ;
966 	else
967 		ccb->ccb_buf.b_flags |= B_WRITE;
968 
969 	/* add offset for metadata */
970 	ccb->ccb_buf.b_flags |= B_PHYS;
971 	ccb->ccb_buf.b_blkno = blk;
972 	ccb->ccb_buf.b_bcount = len;
973 	ccb->ccb_buf.b_bufsize = len;
974 	ccb->ccb_buf.b_resid = len;
975 	ccb->ccb_buf.b_data = data;
976 	ccb->ccb_buf.b_error = 0;
977 	ccb->ccb_buf.b_proc = curproc;
978 	ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm;
979 	ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn;
980 	if ((ccb->ccb_buf.b_flags & B_READ) == 0)
981 		ccb->ccb_buf.b_vp->v_numoutput++;
982 
983 	ccb->ccb_wu = wu;
984 	ccb->ccb_target = dsk;
985 	if (pbuf || qbuf) {
986 		if (qbuf && gf_premul(gn)) {
987 			sr_ccb_put(ccb);
			return (-1);
		}
988 
989 		pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL);
990 		if (pqbuf == NULL) {
991 			sr_ccb_put(ccb);
992 			return (-1);
993 		}
994 		pqbuf->pbuf = pbuf;
995 		pqbuf->qbuf = qbuf;
996 		pqbuf->gn = gn;
997 		ccb->ccb_opaque = pqbuf;
998 	}
999 
1000 	LIST_INIT(&ccb->ccb_buf.b_dep);
1001 	TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link);
1002 
1003 	DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d "
1004 	    "b_blkno: %lld b_flags 0x%0x b_data %p\n",
1005 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
1006 	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno,
1007 	    ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data);
1008 
1009 	wu->swu_io_count++;
1010 
1011 	return (0);
1012 }
1013 
1014 /* Perform RAID6 parity calculation.
1015  *   P = xor parity, Q = GF(256) parity, D = data, gn = gf_pow[] coefficient */
1016 void
1017 sr_raid6_xorp(void *p, void *d, int len)
1018 {
1019 	uint8_t *pbuf = p, *data = d;
1020 
1021 	while (len--)
1022 		pbuf[len] ^= data[len];
1023 }
1024 
1025 void
1026 sr_raid6_xorq(void *q, void *d, int len, int gn)
1027 {
1028 	uint8_t		*qbuf = q, *data = d;
1029 	uint8_t		*gn_map = gf_map[gn];
1030 
1031 	/* gn is constant across the buffer, so multiply one byte at a */
1032 	/* time via its precomputed gf_map table */
1033 	while (len--)
1034 		qbuf[len] ^= gn_map[data[len]];
1035 }
1036 
1037 /* Create GF256 log/pow tables: polynomial = 0x11D */
1038 void
1039 gf_init(void)
1040 {
1041 	int i;
1042 	uint8_t p = 1;
1043 
1044 	/* use 2N pow table to avoid using % in multiply */
1045 	for (i=0; i<256; i++) {
1046 		gf_log[p] = i;
1047 		gf_pow[i] = gf_pow[i+255] = p;
1048 		p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00));
1049 	}
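	/*
	 * 0 has no logarithm; 512 points into the zeroed tail of gf_pow
	 * so that gf_premul() produces 0 for any product involving 0.
	 */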
1050 	gf_log[0] = 512;
1051 }
1052 
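/* Multiplicative inverse in GF(256); only defined for a != 0. */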
1053 uint8_t
1054 gf_inv(uint8_t a)
1055 {
1056 	return gf_pow[255 - gf_log[a]];
1057 }
1058 
1059 /* Precalculate multiplication tables for drive gn */
1060 int
1061 gf_premul(uint8_t gn)
1062 {
1063 	int i;
1064 
1065 	if (gf_map[gn] != NULL)
1066 		return (0);
1067 
1068 	if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL)
1069 		return (-1);
1070 
1071 	for (i=0; i<256; i++)
1072 		gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]];
1073 	return (0);
1074 }
1075