xref: /openbsd-src/sys/dev/softraid_raid5.c (revision e5157e49389faebcb42b7237d55fbf096d9c2523)
1 /* $OpenBSD: softraid_raid5.c,v 1.16 2014/09/14 14:17:24 jsg Exp $ */
2 /*
3  * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
4  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
5  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include "bio.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/buf.h>
25 #include <sys/device.h>
26 #include <sys/ioctl.h>
27 #include <sys/malloc.h>
28 #include <sys/kernel.h>
29 #include <sys/disk.h>
30 #include <sys/rwlock.h>
31 #include <sys/queue.h>
32 #include <sys/fcntl.h>
33 #include <sys/disklabel.h>
34 #include <sys/mount.h>
35 #include <sys/sensors.h>
36 #include <sys/stat.h>
37 #include <sys/task.h>
38 #include <sys/pool.h>
39 #include <sys/conf.h>
40 #include <sys/uio.h>
41 
42 #include <scsi/scsi_all.h>
43 #include <scsi/scsiconf.h>
44 #include <scsi/scsi_disk.h>
45 
46 #include <dev/softraidvar.h>
47 #include <dev/rndvar.h>
48 
49 /* RAID 5 functions. */
50 int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
51 	    int, int64_t);
52 int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
53 	    int, void *);
54 int	sr_raid5_init(struct sr_discipline *);
55 int	sr_raid5_rw(struct sr_workunit *);
56 int	sr_raid5_openings(struct sr_discipline *);
57 void	sr_raid5_intr(struct buf *);
58 int	sr_raid5_wu_done(struct sr_workunit *);
59 void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
60 void	sr_raid5_set_vol_state(struct sr_discipline *);
61 
62 int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, daddr_t,
63 	    void *, int, int, void *);
64 int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, daddr_t,
65 	    void *);
66 int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
67 	    daddr_t, daddr_t, void *, int, int);
68 void	sr_raid5_xor(void *, void *, int);
69 
70 void	sr_raid5_rebuild(struct sr_discipline *);
71 void	sr_raid5_scrub(struct sr_discipline *);
72 
73 /* discipline initialisation. */
74 void
75 sr_raid5_discipline_init(struct sr_discipline *sd)
76 {
77 	/* Fill out discipline members. */
78 	sd->sd_type = SR_MD_RAID5;
79 	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
80 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
81 	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
82 	sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */
83 	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */
84 
85 	/* Setup discipline specific function pointers. */
86 	sd->sd_assemble = sr_raid5_assemble;
87 	sd->sd_create = sr_raid5_create;
88 	sd->sd_openings = sr_raid5_openings;
89 	sd->sd_rebuild = sr_raid5_rebuild;
90 	sd->sd_scsi_rw = sr_raid5_rw;
91 	sd->sd_scsi_intr = sr_raid5_intr;
92 	sd->sd_scsi_wu_done = sr_raid5_wu_done;
93 	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
94 	sd->sd_set_vol_state = sr_raid5_set_vol_state;
95 }
96 
/*
 * Create a new RAID 5 volume across no_chunk partitions of
 * coerced_size sectors each.  Returns 0 on success or EINVAL if the
 * configuration is unusable.
 */
int
sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, int64_t coerced_size)
{
	/* One chunk's worth of space goes to parity, so require >= 3. */
	if (no_chunk < 3) {
		sr_error(sd->sd_sc, "%s requires three or more chunks",
		    sd->sd_name);
		return EINVAL;
	}

	/*
	 * XXX add variable strip size later even though MAXPHYS is really
	 * the clever value, users like to tinker with that type of stuff.
	 */
	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
	/*
	 * Usable volume size: the per-chunk size rounded down to a whole
	 * number of strips (strip size converted from bytes to sectors),
	 * multiplied by the number of data chunks (total minus one for
	 * parity).
	 */
	sd->sd_meta->ssdi.ssd_size = (coerced_size &
	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
	    DEV_BSHIFT) - 1)) * (no_chunk - 1);

	return sr_raid5_init(sd);
}
118 
/*
 * Assemble an existing RAID 5 volume from its on-disk metadata; all of
 * the runtime initialisation is shared with creation via
 * sr_raid5_init().
 */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	return sr_raid5_init(sd);
}
125 
/*
 * Common initialisation for create and assemble: cache the strip size
 * as a shift count for the I/O path.  Returns 0 on success or EINVAL
 * if the metadata strip size is rejected by sr_validate_stripsize().
 */
int
sr_raid5_init(struct sr_discipline *sd)
{
	/* Initialise runtime values. */
	sd->mds.mdd_raid5.sr5_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
		sr_error(sd->sd_sc, "invalid strip size");
		return EINVAL;
	}

	return 0;
}
139 
140 int
141 sr_raid5_openings(struct sr_discipline *sd)
142 {
143 	/* Two work units per I/O, two for rebuild/scrub. */
144 	return ((sd->sd_max_wu - 2) >> 1);
145 }
146 
/*
 * Move chunk c of the volume to new_state, enforcing the legal chunk
 * state transitions for RAID 5.  An illegal transition panics.  After
 * a successful transition the volume state is recomputed and the now
 * dirty metadata is scheduled to be written out.
 */
void
sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int			old_state, s;

	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		/* An offline chunk can only come back via a rebuild. */
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	/* Persist the state change via the metadata save task. */
	sd->sd_must_flush = 1;
	task_add(systq, &sd->sd_meta_save_task);
done:
	splx(s);
}
221 
/*
 * Recompute the volume state from the aggregate chunk states and
 * validate the transition from the previous volume state (illegal
 * transitions panic).  RAID 5 survives the loss of a single chunk:
 * all chunks online is ONLINE, more than one chunk missing is
 * OFFLINE, otherwise SCRUB/REBUILD while such an operation is in
 * progress, else DEGRADED.
 */
void
sr_raid5_set_vol_state(struct sr_discipline *sd)
{
	int			states[SR_MAX_STATES];
	int			new_state, i, s, nd;
	int			old_state = sd->sd_vol_status;

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	/* Histogram the chunk states. */
	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	/* Derive the new volume state; the order of these tests matters. */
	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 1)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] == nd - 1)
		new_state = BIOC_SVDEGRADED;
	else {
#ifdef SR_DEBUG
		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
		for (i = 0; i < nd; i++)
			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
			    DEVNAME(sd->sd_sc), i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
#endif
		panic("invalid volume state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	/* Validate the transition from the previous volume state. */
	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}
346 
347 static inline int
348 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
349 {
350 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
351 	case BIOC_SDONLINE:
352 	case BIOC_SDSCRUB:
353 		return 1;
354 	default:
355 		return 0;
356 	}
357 }
358 
359 static inline int
360 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
361 {
362 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
363 	case BIOC_SDREBUILD:
364 		return 1;
365 	default:
366 		return 0;
367 	}
368 }
369 
370 int
371 sr_raid5_rw(struct sr_workunit *wu)
372 {
373 	struct sr_workunit	*wu_r = NULL;
374 	struct sr_discipline	*sd = wu->swu_dis;
375 	struct scsi_xfer	*xs = wu->swu_xs;
376 	struct sr_chunk		*scp;
377 	daddr_t			blk, lba;
378 	int64_t			chunk_offs, lbaoffs, phys_offs, strip_offs;
379 	int64_t			strip_bits, strip_no, strip_size;
380 	int64_t			chunk, no_chunk;
381 	int64_t			length, parity, datalen, row_size;
382 	void			*data;
383 	int			s;
384 
385 	/* blk and scsi error will be handled by sr_validate_io */
386 	if (sr_validate_io(wu, &blk, "sr_raid5_rw"))
387 		goto bad;
388 
389 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: lba %lld size %d\n",
390 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
391 	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
392 	    (long long)blk, xs->datalen);
393 
394 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
395 	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
396 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
397 	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
398 
399 	data = xs->data;
400 	datalen = xs->datalen;
401 	lbaoffs	= blk << DEV_BSHIFT;
402 
403 	if (xs->flags & SCSI_DATA_OUT) {
404 		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
405 			printf("%s: %s failed to get read work unit",
406 			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
407 			goto bad;
408 		}
409 		wu_r->swu_state = SR_WU_INPROGRESS;
410 		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
411 	}
412 
413 	wu->swu_blk_start = 0;
414 	while (datalen != 0) {
415 		strip_no = lbaoffs >> strip_bits;
416 		strip_offs = lbaoffs & (strip_size - 1);
417 		chunk_offs = (strip_no / no_chunk) << strip_bits;
418 		phys_offs = chunk_offs + strip_offs +
419 		    (sd->sd_meta->ssd_data_offset << DEV_BSHIFT);
420 
421 		/* get size remaining in this stripe */
422 		length = MIN(strip_size - strip_offs, datalen);
423 
424 		/*
425 		 * Map disk offset to data and parity chunks, using a left
426 		 * asymmetric algorithm for the parity assignment.
427 		 */
428 		chunk = strip_no % no_chunk;
429 		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
430 		if (chunk >= parity)
431 			chunk++;
432 
433 		lba = phys_offs >> DEV_BSHIFT;
434 
435 		/* XXX big hammer.. exclude I/O from entire stripe */
436 		if (wu->swu_blk_start == 0)
437 			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
438 		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
439 		    (row_size - 1);
440 
441 		scp = sd->sd_vol.sv_chunks[chunk];
442 		if (xs->flags & SCSI_DATA_IN) {
443 			switch (scp->src_meta.scm_status) {
444 			case BIOC_SDONLINE:
445 			case BIOC_SDSCRUB:
446 				/*
447 				 * Chunk is online, issue a single read
448 				 * request.
449 				 */
450 				if (sr_raid5_addio(wu, chunk, lba, length,
451 				    data, xs->flags, 0, NULL))
452 					goto bad;
453 				break;
454 			case BIOC_SDOFFLINE:
455 			case BIOC_SDREBUILD:
456 			case BIOC_SDHOTSPARE:
457 				if (sr_raid5_regenerate(wu, chunk, lba,
458 				    length, data))
459 					goto bad;
460 				break;
461 			default:
462 				printf("%s: is offline, can't read\n",
463 				    DEVNAME(sd->sd_sc));
464 				goto bad;
465 			}
466 		} else {
467 			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
468 			    length, data, xs->flags, 0))
469 				goto bad;
470 		}
471 
472 		/* advance to next block */
473 		lbaoffs += length;
474 		datalen -= length;
475 		data += length;
476 	}
477 
478 	s = splbio();
479 	if (wu_r) {
480 		if (wu_r->swu_io_count > 0) {
481 			/* collide write request with reads */
482 			wu_r->swu_blk_start = wu->swu_blk_start;
483 			wu_r->swu_blk_end = wu->swu_blk_end;
484 
485 			wu->swu_state = SR_WU_DEFERRED;
486 			wu_r->swu_collider = wu;
487 			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
488 
489 			wu = wu_r;
490 		} else {
491 			sr_scsi_wu_put(sd, wu_r);
492 		}
493 	}
494 	splx(s);
495 
496 	sr_schedule_wu(wu);
497 
498 	return (0);
499 
500 bad:
501 	/* wu is unwound by sr_wu_put */
502 	if (wu_r)
503 		sr_scsi_wu_put(sd, wu_r);
504 	return (1);
505 }
506 
/*
 * Regenerate the contents of 'chunk' for the range [blkno, blkno+len)
 * into 'data' by reading the corresponding block from every other
 * chunk; each completed read is xored into 'data' by sr_raid5_intr()
 * via the ccb_opaque xor buffer.  Returns 0 on success, 1 on failure.
 */
int
sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
    daddr_t len, void *data)
{
	struct sr_discipline	*sd = wu->swu_dis;
	int			i;

	/*
	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
	 * from all of the remaining online chunks. This requires the parity
	 * to already be correct.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
	    "regenerating block %llu\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);

	/* Start from zero; the reads xor their data into this buffer. */
	memset(data, 0, len);
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk)
			continue;
		/* Every other chunk must be readable for the xor to work. */
		if (!sr_raid5_chunk_online(sd, i))
			goto bad;
		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
		    0, data))
			goto bad;
	}
	return (0);

bad:
	return (1);
}
539 
/*
 * Issue the I/O for a write of 'len' bytes at 'blkno' to data chunk
 * 'chunk', with parity on chunk 'parity'.  Reads needed for the
 * parity calculation are queued on wu_r; the data and parity writes
 * are queued on wu.  Returns 0 on success, 1 on failure.
 */
int
sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
    int parity, daddr_t blkno, daddr_t len, void *data, int xsflags,
    int ccbflags)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;
	void			*xorbuf;
	int			chunk_online, chunk_rebuild;
	int			parity_online, parity_rebuild;
	int			other_offline = 0, other_rebuild = 0;
	int			i;

	/*
	 * Perform a write to a RAID 5 volume. This write routine does not
	 * require the parity to already be correct and will operate on a
	 * uninitialised volume.
	 *
	 * There are four possible cases:
	 *
	 * 1) All data chunks and parity are online. In this case we read the
	 *    data from all data chunks, except the one we are writing to, in
	 *    order to calculate and write the new parity.
	 *
	 * 2) The parity chunk is offline. In this case we only need to write
	 *    to the data chunk. No parity calculation is required.
	 *
	 * 3) The data chunk is offline. In this case we read the data from all
	 *    online chunks in order to calculate and write the new parity.
	 *    This is the same as (1) except we do not write the data chunk.
	 *
	 * 4) A different data chunk is offline. The new parity is calculated
	 *    by taking the existing parity, xoring the original data and
	 *    xoring in the new data. This requires that the parity already be
	 *    correct, which it will be if any of the data chunks has
	 *    previously been written.
	 *
	 * There is an additional complication introduced by a chunk that is
	 * being rebuilt. If this is the data or parity chunk, then we want
	 * to write to it as per normal. If it is another data chunk then we
	 * need to presume that it has not yet been regenerated and use the
	 * same method as detailed in (4) above.
	 */

	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
	    "blk %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk, parity, (unsigned long long)blkno);

	chunk_online = sr_raid5_chunk_online(sd, chunk);
	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
	parity_online = sr_raid5_chunk_online(sd, parity);
	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);

	/* Classify the remaining chunks to pick a parity strategy. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (i == chunk || i == parity)
			continue;
		if (sr_raid5_chunk_rebuild(sd, i))
			other_rebuild = 1;
		else if (!sr_raid5_chunk_online(sd, i))
			other_offline = 1;
	}

	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    chunk_online, parity_online, other_offline);

	/* Case (2): parity unavailable, write the data only. */
	if (!parity_online && !parity_rebuild)
		goto data_write;

	/* New parity starts out as a copy of the new data. */
	xorbuf = sr_block_get(sd, len);
	if (xorbuf == NULL)
		goto bad;
	memcpy(xorbuf, data, len);

	/*
	 * NOTE(review): if one of the read sr_raid5_addio() calls below
	 * fails, xorbuf appears to be leaked (only the final parity-write
	 * addio takes ownership via SR_CCBF_FREEBUF) — verify.
	 */
	if (other_offline || other_rebuild) {

		/*
		 * XXX - If we can guarantee that this LBA has been scrubbed
		 * then we can also take this faster path.
		 */

		/* Read in existing data and existing parity. */
		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
		    SCSI_DATA_IN, 0, xorbuf))
			goto bad;

	} else {

		/* Read in existing data from all other chunks. */
		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
			if (i == chunk || i == parity)
				continue;
			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
			    SCSI_DATA_IN, 0, xorbuf))
				goto bad;
		}

	}

	/* Write new parity. */
	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
	    SR_CCBF_FREEBUF, NULL))
		goto bad;

data_write:
	/* Write new data. */
	if (chunk_online || chunk_rebuild)
		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
		    0, NULL))
			goto bad;

	return (0);

bad:
	return (1);
}
659 
/*
 * Per-buf I/O completion handler, called at interrupt time.  Completes
 * the ccb, xors the transferred data into a regeneration/parity buffer
 * when one was attached via ccb_opaque, releases any temporary buffer
 * owned by the ccb and finishes the work unit accounting.
 */
void
sr_raid5_intr(struct buf *bp)
{
	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
	struct sr_workunit	*wu = ccb->ccb_wu;
	struct sr_discipline	*sd = wu->swu_dis;
	int			s;

	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);

	s = splbio();
	sr_ccb_done(ccb);

	/* XXX - Should this be done via the taskq? */

	/* XOR data to result. */
	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
		    ccb->ccb_buf.b_bcount);

	/* Free allocated data buffer. */
	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
		ccb->ccb_buf.b_data = NULL;
	}

	sr_wu_done(wu);
	splx(s);
}
690 
/*
 * Decide the fate of a completed work unit.  Discipline-internal and
 * rebuild work units report success unconditionally (there is no way
 * to propagate their errors); failed reads are retried by restarting
 * the work unit; failed writes are failed permanently.  Returns
 * SR_WU_OK, SR_WU_RESTART or SR_WU_FAILED.
 */
int
sr_raid5_wu_done(struct sr_workunit *wu)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct scsi_xfer	*xs = wu->swu_xs;

	/* XXX - we have no way of propagating errors... */
	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
		return SR_WU_OK;

	/* XXX - This is insufficient for RAID 5. */
	if (wu->swu_ios_succeeded > 0) {
		xs->error = XS_NOERROR;
		return SR_WU_OK;
	}

	if (xs->flags & SCSI_DATA_IN) {
		/* Retry the read from scratch. */
		printf("%s: retrying read on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
		sr_wu_release_ccbs(wu);
		wu->swu_state = SR_WU_RESTART;
		if (sd->sd_scsi_rw(wu) == 0)
			return SR_WU_RESTART;
	} else {
		/* XXX - retry write if we just went from online to degraded. */
		printf("%s: permanently fail write on block %lld\n",
		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
	}

	wu->swu_state = SR_WU_FAILED;
	xs->error = XS_DRIVER_STUFFUP;

	return SR_WU_FAILED;
}
725 
/*
 * Queue a single chunk I/O of 'len' bytes at 'blkno' on work unit wu.
 * If 'data' is NULL a temporary buffer is allocated and freed on
 * completion (SR_CCBF_FREEBUF).  A non-NULL 'xorbuf' is attached to
 * the ccb via ccb_opaque so sr_raid5_intr() xors the transferred data
 * into it.  Returns 0 on success, -1 on failure.
 */
int
sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
    daddr_t len, void *data, int xsflags, int ccbflags, void *xorbuf)
{
	struct sr_discipline	*sd = wu->swu_dis;
	struct sr_ccb		*ccb;

	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
	    "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
	    chunk, (long long)blkno, (long long)len, xorbuf ? "X0R" : "-");

	/* Allocate temporary buffer. */
	if (data == NULL) {
		data = sr_block_get(sd, len);
		if (data == NULL)
			return (-1);
		ccbflags |= SR_CCBF_FREEBUF;
	}

	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
	if (ccb == NULL) {
		/* Release the buffer if we allocated it above. */
		if (ccbflags & SR_CCBF_FREEBUF)
			sr_block_put(sd, data, len);
		return (-1);
	}
	ccb->ccb_opaque = xorbuf;
	sr_wu_enqueue_ccb(wu, ccb);

	return (0);
}
756 
/*
 * xor 'len' bytes of b into a, a word at a time.  Callers only pass
 * buffer lengths that are a multiple of four bytes.
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t	*dst = a;
	uint32_t	*src = b;
	int		 i, words;

	words = len >> 2;
	for (i = 0; i < words; i++)
		dst[i] ^= src[i];
}
766 
/*
 * Rebuild the single chunk marked BIOC_SDREBUILD, one strip at a time:
 * regenerate each strip from the other chunks into a buffer via a read
 * work unit, then write it to the rebuild chunk with a deferred write
 * work unit, excluding other I/O from the affected row in the
 * meantime.  Progress is recorded in ssd_rebuild and periodically
 * flushed to the metadata.  On completion the chunk is set online.
 */
void
sr_raid5_rebuild(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, strip_bits, i, psz, rb;
	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept, percent = 0, old_percent = -1;
	int rebuild_chunk = -1;
	void *xorbuf;

	/* Find the rebuild chunk. */
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sr_raid5_chunk_rebuild(sd, i)) {
			rebuild_chunk = i;
			break;
		}
	}
	if (rebuild_chunk == -1)
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;

	/* XXX - handle restarts. */
	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
	    row_size);

	for (strip_no = 0; strip_no < chunk_strips; strip_no++) {
		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no +
		    sd->sd_meta->ssd_data_offset;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);

		wu_w = sr_scsi_wu_get(sd, 0);
		wu_r = sr_scsi_wu_get(sd, 0);

		/*
		 * NOTE(review): sr_block_get() is not checked for NULL
		 * here, and wu_r/wu_w are not returned on the 'bad'
		 * path — verify whether these can leak or fault.
		 */
		xorbuf = sr_block_get(sd, strip_size);
		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
		    strip_size, xorbuf))
			goto bad;
		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
			goto bad;

		/* Collide write work unit with read work unit. */
		wu_r->swu_state = SR_WU_INPROGRESS;
		wu_r->swu_flags |= SR_WUF_REBUILD;
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		/* Block I/O to this strip while we rebuild it. */
		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
		wu_w->swu_blk_start = wu_r->swu_blk_start;
		wu_w->swu_blk_end = wu_r->swu_blk_end;

		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    wu_r->swu_blk_start, wu_r->swu_blk_end);

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		sr_schedule_wu(wu_r);

		/* Wait for the deferred write to finish. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_rebuild", 0);
			slept = 1;
		}
		/* Yield so regular I/O can make progress. */
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);

		sr_scsi_wu_put(sd, wu_r);
		sr_scsi_wu_put(sd, wu_w);

		/* Record progress in volume sectors. */
		sd->sd_meta->ssd_rebuild =
		    (chunk_lba - sd->sd_meta->ssd_data_offset) * chunk_count;

		psz = sd->sd_meta->ssdi.ssd_size;
		rb = sd->sd_meta->ssd_rebuild;
		if (rb > 0)
			percent = 100 - ((psz * 100 - rb * 100) / psz) - 1;
		else
			percent = 0;
		/* Save metadata whenever the completion percentage changes. */
		if (percent != old_percent && strip_no != chunk_strips - 1) {
			if (sr_meta_save(sd, SR_META_DIRTY))
				printf("%s: could not save metadata to %s\n",
				    DEVNAME(sd->sd_sc),
				    sd->sd_meta->ssd_devname);
			old_percent = percent;
		}

		if (sd->sd_reb_abort)
			goto abort;
	}

	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
	    sd->sd_meta->ssd_devname);

	/* all done */
	sd->sd_meta->ssd_rebuild = 0;
	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
		    BIOC_SDREBUILD) {
			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
			break;
		}
	}

	return;

abort:
	if (sr_meta_save(sd, SR_META_DIRTY))
		printf("%s: could not save metadata to %s\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
bad:
	return;
}
898 
#if 0
/*
 * Scrub the volume by recomputing and rewriting the parity for every
 * strip.  Currently compiled out and incomplete: the chunk LBA passed
 * to sr_raid5_addio() is a 0xBADCAFE placeholder and the work units
 * obtained at the top are never returned.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits;
	int64_t i;
	struct sr_workunit *wu_r, *wu_w;
	int s, slept;
	void *xorbuf;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	for (strip_no = 0; strip_no < max_strip; strip_no++) {
		/* Left asymmetric parity placement, as in sr_raid5_rw(). */
		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));

		/* Read all data strips, xor them, rewrite the parity. */
		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= no_chunk; i++) {
			if (i != parity)
				sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
				    NULL, SCSI_DATA_IN, 0, xorbuf);
		}
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		/* Wait for the deferred parity write to complete. */
		slept = 0;
		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep(wu_w, PRIBIO, "sr_scrub", 0);
			slept = 1;
		}
		if (!slept)
			tsleep(sd->sd_sc, PWAIT, "sr_yield", 1);
	}
done:
	return;
}
#endif
955