xref: /openbsd-src/sys/dev/softraid_raid5.c (revision f6d8fcaed18e5ccb8e77952c23ec09be39e71cef)
1*f6d8fcaeSderaadt /* $OpenBSD: softraid_raid5.c,v 1.32 2021/05/16 15:12:37 deraadt Exp $ */
2f8000896Sjsing /*
37b8b4b44Sjsing  * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
4f8000896Sjsing  * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
5f8000896Sjsing  * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
6f8000896Sjsing  *
7f8000896Sjsing  * Permission to use, copy, modify, and distribute this software for any
8f8000896Sjsing  * purpose with or without fee is hereby granted, provided that the above
9f8000896Sjsing  * copyright notice and this permission notice appear in all copies.
10f8000896Sjsing  *
11f8000896Sjsing  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12f8000896Sjsing  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13f8000896Sjsing  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14f8000896Sjsing  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15f8000896Sjsing  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16f8000896Sjsing  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17f8000896Sjsing  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18f8000896Sjsing  */
19f8000896Sjsing 
20f8000896Sjsing #include "bio.h"
21f8000896Sjsing 
22f8000896Sjsing #include <sys/param.h>
23f8000896Sjsing #include <sys/systm.h>
24f8000896Sjsing #include <sys/buf.h>
25f8000896Sjsing #include <sys/device.h>
26f8000896Sjsing #include <sys/ioctl.h>
27f8000896Sjsing #include <sys/malloc.h>
28f8000896Sjsing #include <sys/kernel.h>
29f8000896Sjsing #include <sys/disk.h>
30f8000896Sjsing #include <sys/rwlock.h>
31f8000896Sjsing #include <sys/queue.h>
32f8000896Sjsing #include <sys/fcntl.h>
33f8000896Sjsing #include <sys/mount.h>
34f8000896Sjsing #include <sys/sensors.h>
35f8000896Sjsing #include <sys/stat.h>
36e328a933Sjsing #include <sys/task.h>
37f8000896Sjsing #include <sys/pool.h>
38f8000896Sjsing #include <sys/conf.h>
39f8000896Sjsing #include <sys/uio.h>
40f8000896Sjsing 
41f8000896Sjsing #include <scsi/scsi_all.h>
42f8000896Sjsing #include <scsi/scsiconf.h>
43f8000896Sjsing #include <scsi/scsi_disk.h>
44f8000896Sjsing 
45f8000896Sjsing #include <dev/softraidvar.h>
46f8000896Sjsing 
47f8000896Sjsing /* RAID 5 functions. */
48f8000896Sjsing int	sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
49f8000896Sjsing 	    int, int64_t);
50f8000896Sjsing int	sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
51f8000896Sjsing 	    int, void *);
52f8000896Sjsing int	sr_raid5_init(struct sr_discipline *);
53f8000896Sjsing int	sr_raid5_rw(struct sr_workunit *);
54f8000896Sjsing int	sr_raid5_openings(struct sr_discipline *);
55f8000896Sjsing void	sr_raid5_intr(struct buf *);
56f8000896Sjsing int	sr_raid5_wu_done(struct sr_workunit *);
57f8000896Sjsing void	sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
58f8000896Sjsing void	sr_raid5_set_vol_state(struct sr_discipline *);
59f8000896Sjsing 
60c804f705Skrw int	sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
61f8000896Sjsing 	    void *, int, int, void *);
62c804f705Skrw int	sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
63d4c09c60Sjsing 	    void *);
64e3e73c0eSjsing int	sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
65c804f705Skrw 	    daddr_t, long, void *, int, int);
66d4c09c60Sjsing void	sr_raid5_xor(void *, void *, int);
67d4c09c60Sjsing 
68fd7fd89bSjsing void	sr_raid5_rebuild(struct sr_discipline *);
69f8000896Sjsing void	sr_raid5_scrub(struct sr_discipline *);
70f8000896Sjsing 
71f8000896Sjsing /* discipline initialisation. */
72f8000896Sjsing void
sr_raid5_discipline_init(struct sr_discipline * sd)73f8000896Sjsing sr_raid5_discipline_init(struct sr_discipline *sd)
74f8000896Sjsing {
75f8000896Sjsing 	/* Fill out discipline members. */
76f8000896Sjsing 	sd->sd_type = SR_MD_RAID5;
77f8000896Sjsing 	strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
78f8000896Sjsing 	sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
79fd7fd89bSjsing 	    SR_CAP_REBUILD | SR_CAP_REDUNDANT;
808a279608Sjsing 	sd->sd_max_wu = SR_RAID5_NOWU + 2;	/* Two for scrub/rebuild. */
81f8000896Sjsing 
82f8000896Sjsing 	/* Setup discipline specific function pointers. */
83f8000896Sjsing 	sd->sd_assemble = sr_raid5_assemble;
84f8000896Sjsing 	sd->sd_create = sr_raid5_create;
85f8000896Sjsing 	sd->sd_openings = sr_raid5_openings;
86fd7fd89bSjsing 	sd->sd_rebuild = sr_raid5_rebuild;
87f8000896Sjsing 	sd->sd_scsi_rw = sr_raid5_rw;
88f8000896Sjsing 	sd->sd_scsi_intr = sr_raid5_intr;
89f8000896Sjsing 	sd->sd_scsi_wu_done = sr_raid5_wu_done;
90f8000896Sjsing 	sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
91f8000896Sjsing 	sd->sd_set_vol_state = sr_raid5_set_vol_state;
92f8000896Sjsing }
93f8000896Sjsing 
94f8000896Sjsing int
sr_raid5_create(struct sr_discipline * sd,struct bioc_createraid * bc,int no_chunk,int64_t coerced_size)95f8000896Sjsing sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
96f8000896Sjsing     int no_chunk, int64_t coerced_size)
97f8000896Sjsing {
98f8000896Sjsing 	if (no_chunk < 3) {
99f8000896Sjsing 		sr_error(sd->sd_sc, "%s requires three or more chunks",
100f8000896Sjsing 		    sd->sd_name);
101f8000896Sjsing 		return EINVAL;
102f8000896Sjsing 	}
103f8000896Sjsing 
104f8000896Sjsing 	/*
105f8000896Sjsing 	 * XXX add variable strip size later even though MAXPHYS is really
106f8000896Sjsing 	 * the clever value, users like to tinker with that type of stuff.
107f8000896Sjsing 	 */
108f8000896Sjsing 	sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
109f8000896Sjsing 	sd->sd_meta->ssdi.ssd_size = (coerced_size &
110f8000896Sjsing 	    ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
111f8000896Sjsing 	    DEV_BSHIFT) - 1)) * (no_chunk - 1);
112f8000896Sjsing 
113f8000896Sjsing 	return sr_raid5_init(sd);
114f8000896Sjsing }
115f8000896Sjsing 
/*
 * Assemble an existing RAID 5 volume. Nothing beyond the common runtime
 * initialisation is needed; bc, no_chunk and data are unused here.
 */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	int			rv;

	rv = sr_raid5_init(sd);

	return rv;
}
122f8000896Sjsing 
123f8000896Sjsing int
sr_raid5_init(struct sr_discipline * sd)124f8000896Sjsing sr_raid5_init(struct sr_discipline *sd)
125f8000896Sjsing {
126f8000896Sjsing 	/* Initialise runtime values. */
127f8000896Sjsing 	sd->mds.mdd_raid5.sr5_strip_bits =
128f8000896Sjsing 	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
129f8000896Sjsing 	if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
130f8000896Sjsing 		sr_error(sd->sd_sc, "invalid strip size");
131f8000896Sjsing 		return EINVAL;
132f8000896Sjsing 	}
133f8000896Sjsing 
1343976ba54Skrw 	sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
1353976ba54Skrw 
136f8000896Sjsing 	return 0;
137f8000896Sjsing }
138f8000896Sjsing 
139f8000896Sjsing int
sr_raid5_openings(struct sr_discipline * sd)140f8000896Sjsing sr_raid5_openings(struct sr_discipline *sd)
141f8000896Sjsing {
1428a279608Sjsing 	/* Two work units per I/O, two for rebuild/scrub. */
1438a279608Sjsing 	return ((sd->sd_max_wu - 2) >> 1);
144f8000896Sjsing }
145f8000896Sjsing 
146f8000896Sjsing void
sr_raid5_set_chunk_state(struct sr_discipline * sd,int c,int new_state)147f8000896Sjsing sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
148f8000896Sjsing {
149f8000896Sjsing 	int			old_state, s;
150f8000896Sjsing 
151f8000896Sjsing 	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
152f8000896Sjsing 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
153f8000896Sjsing 	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
154f8000896Sjsing 
155f8000896Sjsing 	/* ok to go to splbio since this only happens in error path */
156f8000896Sjsing 	s = splbio();
157f8000896Sjsing 	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
158f8000896Sjsing 
159f8000896Sjsing 	/* multiple IOs to the same chunk that fail will come through here */
160f8000896Sjsing 	if (old_state == new_state)
161f8000896Sjsing 		goto done;
162f8000896Sjsing 
163f8000896Sjsing 	switch (old_state) {
164f8000896Sjsing 	case BIOC_SDONLINE:
165f8000896Sjsing 		switch (new_state) {
166f8000896Sjsing 		case BIOC_SDOFFLINE:
167f8000896Sjsing 		case BIOC_SDSCRUB:
168f8000896Sjsing 			break;
169f8000896Sjsing 		default:
170f8000896Sjsing 			goto die;
171f8000896Sjsing 		}
172f8000896Sjsing 		break;
173f8000896Sjsing 
174f8000896Sjsing 	case BIOC_SDOFFLINE:
175f8000896Sjsing 		if (new_state == BIOC_SDREBUILD) {
176f8000896Sjsing 			;
177f8000896Sjsing 		} else
178f8000896Sjsing 			goto die;
179f8000896Sjsing 		break;
180f8000896Sjsing 
181f8000896Sjsing 	case BIOC_SDSCRUB:
182f8000896Sjsing 		switch (new_state) {
183f8000896Sjsing 		case BIOC_SDONLINE:
184f8000896Sjsing 		case BIOC_SDOFFLINE:
185f8000896Sjsing 			break;
186f8000896Sjsing 		default:
187f8000896Sjsing 			goto die;
188f8000896Sjsing 		}
189f8000896Sjsing 		break;
190f8000896Sjsing 
191f8000896Sjsing 	case BIOC_SDREBUILD:
192f8000896Sjsing 		switch (new_state) {
193f8000896Sjsing 		case BIOC_SDONLINE:
194f8000896Sjsing 		case BIOC_SDOFFLINE:
195f8000896Sjsing 			break;
196f8000896Sjsing 		default:
197f8000896Sjsing 			goto die;
198f8000896Sjsing 		}
199f8000896Sjsing 		break;
200f8000896Sjsing 
201f8000896Sjsing 	default:
202f8000896Sjsing die:
203f8000896Sjsing 		splx(s); /* XXX */
204*f6d8fcaeSderaadt 		panic("%s: %s: %s: invalid chunk state transition %d -> %d",
205*f6d8fcaeSderaadt 		    DEVNAME(sd->sd_sc),
206f8000896Sjsing 		    sd->sd_meta->ssd_devname,
207f8000896Sjsing 		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
208f8000896Sjsing 		    old_state, new_state);
209f8000896Sjsing 		/* NOTREACHED */
210f8000896Sjsing 	}
211f8000896Sjsing 
212f8000896Sjsing 	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
213f8000896Sjsing 	sd->sd_set_vol_state(sd);
214f8000896Sjsing 
215f8000896Sjsing 	sd->sd_must_flush = 1;
216e328a933Sjsing 	task_add(systq, &sd->sd_meta_save_task);
217f8000896Sjsing done:
218f8000896Sjsing 	splx(s);
219f8000896Sjsing }
220f8000896Sjsing 
221f8000896Sjsing void
sr_raid5_set_vol_state(struct sr_discipline * sd)222f8000896Sjsing sr_raid5_set_vol_state(struct sr_discipline *sd)
223f8000896Sjsing {
224f8000896Sjsing 	int			states[SR_MAX_STATES];
225f8000896Sjsing 	int			new_state, i, s, nd;
226f8000896Sjsing 	int			old_state = sd->sd_vol_status;
227f8000896Sjsing 
228f8000896Sjsing 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
229f8000896Sjsing 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
230f8000896Sjsing 
231f8000896Sjsing 	nd = sd->sd_meta->ssdi.ssd_chunk_no;
232f8000896Sjsing 
233f8000896Sjsing 	for (i = 0; i < SR_MAX_STATES; i++)
234f8000896Sjsing 		states[i] = 0;
235f8000896Sjsing 
236f8000896Sjsing 	for (i = 0; i < nd; i++) {
237f8000896Sjsing 		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
238f8000896Sjsing 		if (s >= SR_MAX_STATES)
239f8000896Sjsing 			panic("%s: %s: %s: invalid chunk state",
240f8000896Sjsing 			    DEVNAME(sd->sd_sc),
241f8000896Sjsing 			    sd->sd_meta->ssd_devname,
242f8000896Sjsing 			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
243f8000896Sjsing 		states[s]++;
244f8000896Sjsing 	}
245f8000896Sjsing 
246f8000896Sjsing 	if (states[BIOC_SDONLINE] == nd)
247f8000896Sjsing 		new_state = BIOC_SVONLINE;
248f8000896Sjsing 	else if (states[BIOC_SDONLINE] < nd - 1)
249f8000896Sjsing 		new_state = BIOC_SVOFFLINE;
250f8000896Sjsing 	else if (states[BIOC_SDSCRUB] != 0)
251f8000896Sjsing 		new_state = BIOC_SVSCRUB;
252f8000896Sjsing 	else if (states[BIOC_SDREBUILD] != 0)
253f8000896Sjsing 		new_state = BIOC_SVREBUILD;
254f8000896Sjsing 	else if (states[BIOC_SDONLINE] == nd - 1)
255f8000896Sjsing 		new_state = BIOC_SVDEGRADED;
256f8000896Sjsing 	else {
257f8000896Sjsing #ifdef SR_DEBUG
258f8000896Sjsing 		DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
259f8000896Sjsing 		    "was %d\n", DEVNAME(sd->sd_sc), old_state);
260f8000896Sjsing 		for (i = 0; i < nd; i++)
261f8000896Sjsing 			DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
262f8000896Sjsing 			    DEVNAME(sd->sd_sc), i,
263f8000896Sjsing 			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
264f8000896Sjsing #endif
265f8000896Sjsing 		panic("invalid volume state");
266f8000896Sjsing 	}
267f8000896Sjsing 
268f8000896Sjsing 	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
269f8000896Sjsing 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
270f8000896Sjsing 	    old_state, new_state);
271f8000896Sjsing 
272f8000896Sjsing 	switch (old_state) {
273f8000896Sjsing 	case BIOC_SVONLINE:
274f8000896Sjsing 		switch (new_state) {
275f8000896Sjsing 		case BIOC_SVONLINE: /* can go to same state */
2760e0446b0Sjsing 		case BIOC_SVOFFLINE:
277f8000896Sjsing 		case BIOC_SVDEGRADED:
278f8000896Sjsing 		case BIOC_SVREBUILD: /* happens on boot */
279f8000896Sjsing 			break;
280f8000896Sjsing 		default:
281f8000896Sjsing 			goto die;
282f8000896Sjsing 		}
283f8000896Sjsing 		break;
284f8000896Sjsing 
285f8000896Sjsing 	case BIOC_SVOFFLINE:
286f8000896Sjsing 		/* XXX this might be a little too much */
287f8000896Sjsing 		goto die;
288f8000896Sjsing 
28997e41644Sjsing 	case BIOC_SVDEGRADED:
29097e41644Sjsing 		switch (new_state) {
29197e41644Sjsing 		case BIOC_SVOFFLINE:
29297e41644Sjsing 		case BIOC_SVREBUILD:
29397e41644Sjsing 		case BIOC_SVDEGRADED: /* can go to the same state */
29497e41644Sjsing 			break;
29597e41644Sjsing 		default:
29697e41644Sjsing 			goto die;
29797e41644Sjsing 		}
29897e41644Sjsing 		break;
29997e41644Sjsing 
3000e0446b0Sjsing 	case BIOC_SVBUILDING:
3010e0446b0Sjsing 		switch (new_state) {
3020e0446b0Sjsing 		case BIOC_SVONLINE:
3030e0446b0Sjsing 		case BIOC_SVOFFLINE:
3040e0446b0Sjsing 		case BIOC_SVBUILDING: /* can go to the same state */
3050e0446b0Sjsing 			break;
3060e0446b0Sjsing 		default:
3070e0446b0Sjsing 			goto die;
3080e0446b0Sjsing 		}
3090e0446b0Sjsing 		break;
3100e0446b0Sjsing 
311f8000896Sjsing 	case BIOC_SVSCRUB:
312f8000896Sjsing 		switch (new_state) {
313f8000896Sjsing 		case BIOC_SVONLINE:
314f8000896Sjsing 		case BIOC_SVOFFLINE:
315f8000896Sjsing 		case BIOC_SVDEGRADED:
316f8000896Sjsing 		case BIOC_SVSCRUB: /* can go to same state */
317f8000896Sjsing 			break;
318f8000896Sjsing 		default:
319f8000896Sjsing 			goto die;
320f8000896Sjsing 		}
321f8000896Sjsing 		break;
322f8000896Sjsing 
323f8000896Sjsing 	case BIOC_SVREBUILD:
324f8000896Sjsing 		switch (new_state) {
325f8000896Sjsing 		case BIOC_SVONLINE:
326f8000896Sjsing 		case BIOC_SVOFFLINE:
327f8000896Sjsing 		case BIOC_SVDEGRADED:
328f8000896Sjsing 		case BIOC_SVREBUILD: /* can go to the same state */
329f8000896Sjsing 			break;
330f8000896Sjsing 		default:
331f8000896Sjsing 			goto die;
332f8000896Sjsing 		}
333f8000896Sjsing 		break;
334f8000896Sjsing 
335f8000896Sjsing 	default:
336f8000896Sjsing die:
337f8000896Sjsing 		panic("%s: %s: invalid volume state transition %d -> %d",
338f8000896Sjsing 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
339f8000896Sjsing 		    old_state, new_state);
340f8000896Sjsing 		/* NOTREACHED */
341f8000896Sjsing 	}
342f8000896Sjsing 
343f8000896Sjsing 	sd->sd_vol_status = new_state;
344f8000896Sjsing }
345f8000896Sjsing 
346d4c09c60Sjsing static inline int
sr_raid5_chunk_online(struct sr_discipline * sd,int chunk)347d4c09c60Sjsing sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
348d4c09c60Sjsing {
349d4c09c60Sjsing 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
350d4c09c60Sjsing 	case BIOC_SDONLINE:
351d4c09c60Sjsing 	case BIOC_SDSCRUB:
352d4c09c60Sjsing 		return 1;
353d4c09c60Sjsing 	default:
354d4c09c60Sjsing 		return 0;
355d4c09c60Sjsing 	}
356d4c09c60Sjsing }
357d4c09c60Sjsing 
358efce0101Sjsing static inline int
sr_raid5_chunk_rebuild(struct sr_discipline * sd,int chunk)359efce0101Sjsing sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
360efce0101Sjsing {
361efce0101Sjsing 	switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
362efce0101Sjsing 	case BIOC_SDREBUILD:
363efce0101Sjsing 		return 1;
364efce0101Sjsing 	default:
365efce0101Sjsing 		return 0;
366efce0101Sjsing 	}
367efce0101Sjsing }
368efce0101Sjsing 
369f8000896Sjsing int
sr_raid5_rw(struct sr_workunit * wu)370f8000896Sjsing sr_raid5_rw(struct sr_workunit *wu)
371f8000896Sjsing {
372f8000896Sjsing 	struct sr_workunit	*wu_r = NULL;
373f8000896Sjsing 	struct sr_discipline	*sd = wu->swu_dis;
374f8000896Sjsing 	struct scsi_xfer	*xs = wu->swu_xs;
375f8000896Sjsing 	struct sr_chunk		*scp;
376d9ec6765Skrw 	daddr_t			blkno, lba;
377d9ec6765Skrw 	int64_t			chunk_offs, lbaoffs, offset, strip_offs;
378f8000896Sjsing 	int64_t			strip_bits, strip_no, strip_size;
379f8000896Sjsing 	int64_t			chunk, no_chunk;
380c804f705Skrw 	int64_t			parity, row_size;
381c804f705Skrw 	long			length, datalen;
382e3e73c0eSjsing 	void			*data;
383d4c09c60Sjsing 	int			s;
384f8000896Sjsing 
385d9ec6765Skrw 	/* blkno and scsi error will be handled by sr_validate_io */
386d9ec6765Skrw 	if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
387f8000896Sjsing 		goto bad;
388f8000896Sjsing 
389d9ec6765Skrw 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
390e3e73c0eSjsing 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
391e3e73c0eSjsing 	    (xs->flags & SCSI_DATA_IN) ? "read" : "write",
392d9ec6765Skrw 	    (long long)blkno, xs->datalen);
393e3e73c0eSjsing 
394f8000896Sjsing 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
395f8000896Sjsing 	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
396f8000896Sjsing 	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
397f8000896Sjsing 	row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
398f8000896Sjsing 
399f8000896Sjsing 	data = xs->data;
400f8000896Sjsing 	datalen = xs->datalen;
401d9ec6765Skrw 	lbaoffs	= blkno << DEV_BSHIFT;
402f8000896Sjsing 
403f8000896Sjsing 	if (xs->flags & SCSI_DATA_OUT) {
404f8000896Sjsing 		if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
4058a279608Sjsing 			printf("%s: %s failed to get read work unit",
4068a279608Sjsing 			    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
407f8000896Sjsing 			goto bad;
408f8000896Sjsing 		}
409f8000896Sjsing 		wu_r->swu_state = SR_WU_INPROGRESS;
410f8000896Sjsing 		wu_r->swu_flags |= SR_WUF_DISCIPLINE;
411f8000896Sjsing 	}
412f8000896Sjsing 
413f8000896Sjsing 	wu->swu_blk_start = 0;
414f8000896Sjsing 	while (datalen != 0) {
415f8000896Sjsing 		strip_no = lbaoffs >> strip_bits;
416f8000896Sjsing 		strip_offs = lbaoffs & (strip_size - 1);
417f8000896Sjsing 		chunk_offs = (strip_no / no_chunk) << strip_bits;
418d9ec6765Skrw 		offset = chunk_offs + strip_offs;
419f8000896Sjsing 
420f8000896Sjsing 		/* get size remaining in this stripe */
421f8000896Sjsing 		length = MIN(strip_size - strip_offs, datalen);
422f8000896Sjsing 
4232e5d087bSjsing 		/*
4242e5d087bSjsing 		 * Map disk offset to data and parity chunks, using a left
4252e5d087bSjsing 		 * asymmetric algorithm for the parity assignment.
4262e5d087bSjsing 		 */
427f8000896Sjsing 		chunk = strip_no % no_chunk;
428f8000896Sjsing 		parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
429f8000896Sjsing 		if (chunk >= parity)
430f8000896Sjsing 			chunk++;
431f8000896Sjsing 
432d9ec6765Skrw 		lba = offset >> DEV_BSHIFT;
433f8000896Sjsing 
434f8000896Sjsing 		/* XXX big hammer.. exclude I/O from entire stripe */
435f8000896Sjsing 		if (wu->swu_blk_start == 0)
436f8000896Sjsing 			wu->swu_blk_start = (strip_no / no_chunk) * row_size;
4372e5d087bSjsing 		wu->swu_blk_end = (strip_no / no_chunk) * row_size +
4382e5d087bSjsing 		    (row_size - 1);
439f8000896Sjsing 
440f8000896Sjsing 		scp = sd->sd_vol.sv_chunks[chunk];
441f8000896Sjsing 		if (xs->flags & SCSI_DATA_IN) {
442f8000896Sjsing 			switch (scp->src_meta.scm_status) {
443f8000896Sjsing 			case BIOC_SDONLINE:
444f8000896Sjsing 			case BIOC_SDSCRUB:
4452e5d087bSjsing 				/*
4462e5d087bSjsing 				 * Chunk is online, issue a single read
4472e5d087bSjsing 				 * request.
4482e5d087bSjsing 				 */
449f8000896Sjsing 				if (sr_raid5_addio(wu, chunk, lba, length,
450f8000896Sjsing 				    data, xs->flags, 0, NULL))
451f8000896Sjsing 					goto bad;
452f8000896Sjsing 				break;
453f8000896Sjsing 			case BIOC_SDOFFLINE:
454f8000896Sjsing 			case BIOC_SDREBUILD:
455f8000896Sjsing 			case BIOC_SDHOTSPARE:
456d4c09c60Sjsing 				if (sr_raid5_regenerate(wu, chunk, lba,
457d4c09c60Sjsing 				    length, data))
458f8000896Sjsing 					goto bad;
459f8000896Sjsing 				break;
460f8000896Sjsing 			default:
461f8000896Sjsing 				printf("%s: is offline, can't read\n",
462f8000896Sjsing 				    DEVNAME(sd->sd_sc));
463f8000896Sjsing 				goto bad;
464f8000896Sjsing 			}
465f8000896Sjsing 		} else {
466e3e73c0eSjsing 			if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
467e3e73c0eSjsing 			    length, data, xs->flags, 0))
468f8000896Sjsing 				goto bad;
469f8000896Sjsing 		}
470f8000896Sjsing 
471f8000896Sjsing 		/* advance to next block */
472f8000896Sjsing 		lbaoffs += length;
473f8000896Sjsing 		datalen -= length;
474f8000896Sjsing 		data += length;
475f8000896Sjsing 	}
476f8000896Sjsing 
477f8000896Sjsing 	s = splbio();
478f8000896Sjsing 	if (wu_r) {
479ebed995dSjsing 		if (wu_r->swu_io_count > 0) {
480f8000896Sjsing 			/* collide write request with reads */
481f8000896Sjsing 			wu_r->swu_blk_start = wu->swu_blk_start;
482f8000896Sjsing 			wu_r->swu_blk_end = wu->swu_blk_end;
483f8000896Sjsing 
484f8000896Sjsing 			wu->swu_state = SR_WU_DEFERRED;
485f8000896Sjsing 			wu_r->swu_collider = wu;
486f8000896Sjsing 			TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
487f8000896Sjsing 
488f8000896Sjsing 			wu = wu_r;
489ebed995dSjsing 		} else {
490ebed995dSjsing 			sr_scsi_wu_put(sd, wu_r);
491ebed995dSjsing 		}
492f8000896Sjsing 	}
493f8000896Sjsing 	splx(s);
494f8000896Sjsing 
495f8000896Sjsing 	sr_schedule_wu(wu);
496f8000896Sjsing 
497f8000896Sjsing 	return (0);
498f8000896Sjsing 
499f8000896Sjsing bad:
500f8000896Sjsing 	/* wu is unwound by sr_wu_put */
501f8000896Sjsing 	if (wu_r)
502f8000896Sjsing 		sr_scsi_wu_put(sd, wu_r);
503f8000896Sjsing 	return (1);
504f8000896Sjsing }
505f8000896Sjsing 
506d4c09c60Sjsing int
sr_raid5_regenerate(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data)507d4c09c60Sjsing sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
508c804f705Skrw     long len, void *data)
509d4c09c60Sjsing {
510d4c09c60Sjsing 	struct sr_discipline	*sd = wu->swu_dis;
511d4c09c60Sjsing 	int			i;
512d4c09c60Sjsing 
513d4c09c60Sjsing 	/*
514d4c09c60Sjsing 	 * Regenerate a block on a RAID 5 volume by xoring the data and parity
515d4c09c60Sjsing 	 * from all of the remaining online chunks. This requires the parity
516d4c09c60Sjsing 	 * to already be correct.
517d4c09c60Sjsing 	 */
518d4c09c60Sjsing 
519d4c09c60Sjsing 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
520d4c09c60Sjsing 	    "regenerating block %llu\n",
521d4c09c60Sjsing 	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);
522d4c09c60Sjsing 
523d4c09c60Sjsing 	memset(data, 0, len);
524d4c09c60Sjsing 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
525d4c09c60Sjsing 		if (i == chunk)
526d4c09c60Sjsing 			continue;
527d4c09c60Sjsing 		if (!sr_raid5_chunk_online(sd, i))
528d4c09c60Sjsing 			goto bad;
529d4c09c60Sjsing 		if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
530d4c09c60Sjsing 		    0, data))
531d4c09c60Sjsing 			goto bad;
532d4c09c60Sjsing 	}
533d4c09c60Sjsing 	return (0);
534d4c09c60Sjsing 
535d4c09c60Sjsing bad:
536d4c09c60Sjsing 	return (1);
537d4c09c60Sjsing }
538d4c09c60Sjsing 
539e3e73c0eSjsing int
sr_raid5_write(struct sr_workunit * wu,struct sr_workunit * wu_r,int chunk,int parity,daddr_t blkno,long len,void * data,int xsflags,int ccbflags)540e3e73c0eSjsing sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
541c804f705Skrw     int parity, daddr_t blkno, long len, void *data, int xsflags,
542e3e73c0eSjsing     int ccbflags)
543e3e73c0eSjsing {
544e3e73c0eSjsing 	struct sr_discipline	*sd = wu->swu_dis;
545e3e73c0eSjsing 	struct scsi_xfer	*xs = wu->swu_xs;
546e3e73c0eSjsing 	void			*xorbuf;
547efce0101Sjsing 	int			chunk_online, chunk_rebuild;
548efce0101Sjsing 	int			parity_online, parity_rebuild;
549efce0101Sjsing 	int			other_offline = 0, other_rebuild = 0;
550e3e73c0eSjsing 	int			i;
551e3e73c0eSjsing 
552e3e73c0eSjsing 	/*
553e3e73c0eSjsing 	 * Perform a write to a RAID 5 volume. This write routine does not
554e3e73c0eSjsing 	 * require the parity to already be correct and will operate on a
555e3e73c0eSjsing 	 * uninitialised volume.
556e3e73c0eSjsing 	 *
557e3e73c0eSjsing 	 * There are four possible cases:
558e3e73c0eSjsing 	 *
559e3e73c0eSjsing 	 * 1) All data chunks and parity are online. In this case we read the
560e3e73c0eSjsing 	 *    data from all data chunks, except the one we are writing to, in
561e3e73c0eSjsing 	 *    order to calculate and write the new parity.
562e3e73c0eSjsing 	 *
563e3e73c0eSjsing 	 * 2) The parity chunk is offline. In this case we only need to write
564e3e73c0eSjsing 	 *    to the data chunk. No parity calculation is required.
565e3e73c0eSjsing 	 *
566e3e73c0eSjsing 	 * 3) The data chunk is offline. In this case we read the data from all
567e3e73c0eSjsing 	 *    online chunks in order to calculate and write the new parity.
568e3e73c0eSjsing 	 *    This is the same as (1) except we do not write the data chunk.
569e3e73c0eSjsing 	 *
570e3e73c0eSjsing 	 * 4) A different data chunk is offline. The new parity is calculated
571e3e73c0eSjsing 	 *    by taking the existing parity, xoring the original data and
572e3e73c0eSjsing 	 *    xoring in the new data. This requires that the parity already be
573e3e73c0eSjsing 	 *    correct, which it will be if any of the data chunks has
574e3e73c0eSjsing 	 *    previously been written.
575efce0101Sjsing 	 *
576efce0101Sjsing 	 * There is an additional complication introduced by a chunk that is
577efce0101Sjsing 	 * being rebuilt. If this is the data or parity chunk, then we want
578efce0101Sjsing 	 * to write to it as per normal. If it is another data chunk then we
579efce0101Sjsing 	 * need to presume that it has not yet been regenerated and use the
580efce0101Sjsing 	 * same method as detailed in (4) above.
581e3e73c0eSjsing 	 */
582e3e73c0eSjsing 
583e3e73c0eSjsing 	DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
584d9ec6765Skrw 	    "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
585e3e73c0eSjsing 	    chunk, parity, (unsigned long long)blkno);
586e3e73c0eSjsing 
587e3e73c0eSjsing 	chunk_online = sr_raid5_chunk_online(sd, chunk);
588efce0101Sjsing 	chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
589e3e73c0eSjsing 	parity_online = sr_raid5_chunk_online(sd, parity);
590efce0101Sjsing 	parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);
591e3e73c0eSjsing 
592e3e73c0eSjsing 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
593e3e73c0eSjsing 		if (i == chunk || i == parity)
594e3e73c0eSjsing 			continue;
595efce0101Sjsing 		if (sr_raid5_chunk_rebuild(sd, i))
596efce0101Sjsing 			other_rebuild = 1;
597efce0101Sjsing 		else if (!sr_raid5_chunk_online(sd, i))
598e3e73c0eSjsing 			other_offline = 1;
599e3e73c0eSjsing 	}
600e3e73c0eSjsing 
601e3e73c0eSjsing 	DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
602e3e73c0eSjsing 	    "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
603e3e73c0eSjsing 	    chunk_online, parity_online, other_offline);
604e3e73c0eSjsing 
605efce0101Sjsing 	if (!parity_online && !parity_rebuild)
606e3e73c0eSjsing 		goto data_write;
607e3e73c0eSjsing 
608e3e73c0eSjsing 	xorbuf = sr_block_get(sd, len);
609e3e73c0eSjsing 	if (xorbuf == NULL)
610e3e73c0eSjsing 		goto bad;
611e3e73c0eSjsing 	memcpy(xorbuf, data, len);
612e3e73c0eSjsing 
613efce0101Sjsing 	if (other_offline || other_rebuild) {
614e3e73c0eSjsing 
615e3e73c0eSjsing 		/*
616e3e73c0eSjsing 		 * XXX - If we can guarantee that this LBA has been scrubbed
617e3e73c0eSjsing 		 * then we can also take this faster path.
618e3e73c0eSjsing 		 */
619e3e73c0eSjsing 
620e3e73c0eSjsing 		/* Read in existing data and existing parity. */
621e3e73c0eSjsing 		if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
622e3e73c0eSjsing 		    SCSI_DATA_IN, 0, xorbuf))
623e3e73c0eSjsing 			goto bad;
624e3e73c0eSjsing 		if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
625e3e73c0eSjsing 		    SCSI_DATA_IN, 0, xorbuf))
626e3e73c0eSjsing 			goto bad;
627e3e73c0eSjsing 
628e3e73c0eSjsing 	} else {
629e3e73c0eSjsing 
630e3e73c0eSjsing 		/* Read in existing data from all other chunks. */
631e3e73c0eSjsing 		for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
632e3e73c0eSjsing 			if (i == chunk || i == parity)
633e3e73c0eSjsing 				continue;
634e3e73c0eSjsing 			if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
635e3e73c0eSjsing 			    SCSI_DATA_IN, 0, xorbuf))
636e3e73c0eSjsing 				goto bad;
637e3e73c0eSjsing 		}
638e3e73c0eSjsing 
639e3e73c0eSjsing 	}
640e3e73c0eSjsing 
641e3e73c0eSjsing 	/* Write new parity. */
642e3e73c0eSjsing 	if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
643e3e73c0eSjsing 	    SR_CCBF_FREEBUF, NULL))
644e3e73c0eSjsing 		goto bad;
645e3e73c0eSjsing 
646e3e73c0eSjsing data_write:
647e3e73c0eSjsing 	/* Write new data. */
648efce0101Sjsing 	if (chunk_online || chunk_rebuild)
649e3e73c0eSjsing 		if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
650e3e73c0eSjsing 		    0, NULL))
651e3e73c0eSjsing 			goto bad;
652e3e73c0eSjsing 
653e3e73c0eSjsing 	return (0);
654e3e73c0eSjsing 
655e3e73c0eSjsing bad:
656e3e73c0eSjsing 	return (1);
657e3e73c0eSjsing }
658e3e73c0eSjsing 
659f8000896Sjsing void
sr_raid5_intr(struct buf * bp)660f8000896Sjsing sr_raid5_intr(struct buf *bp)
661f8000896Sjsing {
662f8000896Sjsing 	struct sr_ccb		*ccb = (struct sr_ccb *)bp;
663f8000896Sjsing 	struct sr_workunit	*wu = ccb->ccb_wu;
664f8000896Sjsing 	struct sr_discipline	*sd = wu->swu_dis;
665f8000896Sjsing 	int			s;
666f8000896Sjsing 
667f8000896Sjsing 	DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
668f8000896Sjsing 	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);
669f8000896Sjsing 
670f8000896Sjsing 	s = splbio();
671f8000896Sjsing 	sr_ccb_done(ccb);
672f8000896Sjsing 
673e328a933Sjsing 	/* XXX - Should this be done via the taskq? */
674f8000896Sjsing 
675f8000896Sjsing 	/* XOR data to result. */
676f8000896Sjsing 	if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
677f8000896Sjsing 		sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
678f8000896Sjsing 		    ccb->ccb_buf.b_bcount);
679f8000896Sjsing 
680f8000896Sjsing 	/* Free allocated data buffer. */
681f8000896Sjsing 	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
6824c26ac15Sjsing 		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
683f8000896Sjsing 		ccb->ccb_buf.b_data = NULL;
684f8000896Sjsing 	}
685f8000896Sjsing 
686f8000896Sjsing 	sr_wu_done(wu);
687f8000896Sjsing 	splx(s);
688f8000896Sjsing }
689f8000896Sjsing 
690f8000896Sjsing int
sr_raid5_wu_done(struct sr_workunit * wu)691f8000896Sjsing sr_raid5_wu_done(struct sr_workunit *wu)
692f8000896Sjsing {
693f8000896Sjsing 	struct sr_discipline	*sd = wu->swu_dis;
694f8000896Sjsing 	struct scsi_xfer	*xs = wu->swu_xs;
695f8000896Sjsing 
696f8000896Sjsing 	/* XXX - we have no way of propagating errors... */
697a6c8c894Sjsing 	if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
698f8000896Sjsing 		return SR_WU_OK;
699f8000896Sjsing 
700d4c09c60Sjsing 	/* XXX - This is insufficient for RAID 5. */
701f8000896Sjsing 	if (wu->swu_ios_succeeded > 0) {
702f8000896Sjsing 		xs->error = XS_NOERROR;
703f8000896Sjsing 		return SR_WU_OK;
704f8000896Sjsing 	}
705f8000896Sjsing 
706f8000896Sjsing 	if (xs->flags & SCSI_DATA_IN) {
707f8000896Sjsing 		printf("%s: retrying read on block %lld\n",
708f8000896Sjsing 		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
709f8000896Sjsing 		sr_wu_release_ccbs(wu);
710f8000896Sjsing 		wu->swu_state = SR_WU_RESTART;
711f8000896Sjsing 		if (sd->sd_scsi_rw(wu) == 0)
712f8000896Sjsing 			return SR_WU_RESTART;
713f8000896Sjsing 	} else {
714a6c8c894Sjsing 		/* XXX - retry write if we just went from online to degraded. */
715f8000896Sjsing 		printf("%s: permanently fail write on block %lld\n",
716f8000896Sjsing 		    sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
717f8000896Sjsing 	}
718f8000896Sjsing 
719f8000896Sjsing 	wu->swu_state = SR_WU_FAILED;
720f8000896Sjsing 	xs->error = XS_DRIVER_STUFFUP;
721f8000896Sjsing 
722f8000896Sjsing 	return SR_WU_FAILED;
723f8000896Sjsing }
724f8000896Sjsing 
725f8000896Sjsing int
sr_raid5_addio(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data,int xsflags,int ccbflags,void * xorbuf)726f8000896Sjsing sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
727c804f705Skrw     long len, void *data, int xsflags, int ccbflags, void *xorbuf)
728f8000896Sjsing {
729f8000896Sjsing 	struct sr_discipline	*sd = wu->swu_dis;
730f8000896Sjsing 	struct sr_ccb		*ccb;
731f8000896Sjsing 
732d4c09c60Sjsing 	DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
733c804f705Skrw 	    "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
734c804f705Skrw 	    chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");
735f8000896Sjsing 
736f8000896Sjsing 	/* Allocate temporary buffer. */
737f8000896Sjsing 	if (data == NULL) {
7384c26ac15Sjsing 		data = sr_block_get(sd, len);
739f8000896Sjsing 		if (data == NULL)
740f8000896Sjsing 			return (-1);
741f8000896Sjsing 		ccbflags |= SR_CCBF_FREEBUF;
742f8000896Sjsing 	}
743f8000896Sjsing 
744f8000896Sjsing 	ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
745f8000896Sjsing 	if (ccb == NULL) {
746f8000896Sjsing 		if (ccbflags & SR_CCBF_FREEBUF)
7474c26ac15Sjsing 			sr_block_put(sd, data, len);
748f8000896Sjsing 		return (-1);
749f8000896Sjsing 	}
750f8000896Sjsing 	ccb->ccb_opaque = xorbuf;
751f8000896Sjsing 	sr_wu_enqueue_ccb(wu, ccb);
752f8000896Sjsing 
753f8000896Sjsing 	return (0);
754f8000896Sjsing }
755f8000896Sjsing 
/*
 * XOR len bytes of buffer b into buffer a (a[i] ^= b[i]); b is not modified.
 *
 * The bulk of the work is done 32 bits at a time, so both buffers are
 * accessed through uint32_t pointers.  Any 1-3 bytes left over when len is
 * not a multiple of four are XORed bytewise instead of being silently
 * skipped.  A zero or negative len is a no-op; a negative count would
 * otherwise run the loop far beyond the end of both buffers.
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t		*xa = a, *xb = b;
	uint8_t			*ta, *tb;
	int			 words, tail;

	if (len <= 0)
		return;

	words = len >> 2;
	tail = len & 3;

	while (words--)
		*xa++ ^= *xb++;

	/* Finish whatever does not fill a whole 32-bit word. */
	ta = (uint8_t *)xa;
	tb = (uint8_t *)xb;
	while (tail--)
		*ta++ ^= *tb++;
}
765f8000896Sjsing 
766fd7fd89bSjsing void
sr_raid5_rebuild(struct sr_discipline * sd)767fd7fd89bSjsing sr_raid5_rebuild(struct sr_discipline *sd)
768fd7fd89bSjsing {
76938301a9fSjsing 	int64_t strip_no, strip_size, strip_bits, i, restart;
770fd7fd89bSjsing 	int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
771fd7fd89bSjsing 	struct sr_workunit *wu_r, *wu_w;
772fd7fd89bSjsing 	int s, slept, percent = 0, old_percent = -1;
773fd7fd89bSjsing 	int rebuild_chunk = -1;
774fd7fd89bSjsing 	void *xorbuf;
775fd7fd89bSjsing 
776fd7fd89bSjsing 	/* Find the rebuild chunk. */
777fd7fd89bSjsing 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
778fd7fd89bSjsing 		if (sr_raid5_chunk_rebuild(sd, i)) {
779fd7fd89bSjsing 			rebuild_chunk = i;
780fd7fd89bSjsing 			break;
781fd7fd89bSjsing 		}
782fd7fd89bSjsing 	}
783fd7fd89bSjsing 	if (rebuild_chunk == -1)
784fd7fd89bSjsing 		goto bad;
785fd7fd89bSjsing 
786fd7fd89bSjsing 	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
787fd7fd89bSjsing 	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
788fd7fd89bSjsing 	chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
789fd7fd89bSjsing 	chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
790fd7fd89bSjsing 	chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
791fd7fd89bSjsing 	row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;
792fd7fd89bSjsing 
793fd7fd89bSjsing 	DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
794fd7fd89bSjsing 	    "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
795fd7fd89bSjsing 	    "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
796fd7fd89bSjsing 	    sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
797fd7fd89bSjsing 	    row_size);
798fd7fd89bSjsing 
799fd37253eSjsing 	restart = sd->sd_meta->ssd_rebuild / row_size;
800fd37253eSjsing 	if (restart > chunk_strips) {
801fd37253eSjsing 		printf("%s: bogus rebuild restart offset, starting from 0\n",
802fd37253eSjsing 		    DEVNAME(sd->sd_sc));
803fd37253eSjsing 		restart = 0;
804fd37253eSjsing 	}
805fd37253eSjsing 	if (restart != 0) {
80638301a9fSjsing 		percent = sr_rebuild_percent(sd);
807fd37253eSjsing 		printf("%s: resuming rebuild on %s at %d%%\n",
808fd37253eSjsing 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
809fd37253eSjsing 	}
810fd37253eSjsing 
811fd37253eSjsing 	for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
812bd56a12fSkrw 		chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;
813fd7fd89bSjsing 
814fd7fd89bSjsing 		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
815fd7fd89bSjsing 		    "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
816fd7fd89bSjsing 		    sd->sd_meta->ssd_devname, strip_no, chunk_lba);
817fd7fd89bSjsing 
818fd7fd89bSjsing 		wu_w = sr_scsi_wu_get(sd, 0);
819fd7fd89bSjsing 		wu_r = sr_scsi_wu_get(sd, 0);
820fd7fd89bSjsing 
821fd7fd89bSjsing 		xorbuf = sr_block_get(sd, strip_size);
82231135a12Stobhe 		if (xorbuf == NULL)
82331135a12Stobhe 			goto bad;
824fd7fd89bSjsing 		if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
825fd7fd89bSjsing 		    strip_size, xorbuf))
826fd7fd89bSjsing 			goto bad;
827fd7fd89bSjsing 		if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
828fd7fd89bSjsing 		    xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
829fd7fd89bSjsing 			goto bad;
830fd7fd89bSjsing 
831fd7fd89bSjsing 		/* Collide write work unit with read work unit. */
832fd7fd89bSjsing 		wu_r->swu_state = SR_WU_INPROGRESS;
833fd7fd89bSjsing 		wu_r->swu_flags |= SR_WUF_REBUILD;
834fd7fd89bSjsing 		wu_w->swu_state = SR_WU_DEFERRED;
835fd7fd89bSjsing 		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
836fd7fd89bSjsing 		wu_r->swu_collider = wu_w;
837fd7fd89bSjsing 
838fd7fd89bSjsing 		/* Block I/O to this strip while we rebuild it. */
839fd7fd89bSjsing 		wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
840fd7fd89bSjsing 		wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
841fd7fd89bSjsing 		wu_w->swu_blk_start = wu_r->swu_blk_start;
842fd7fd89bSjsing 		wu_w->swu_blk_end = wu_r->swu_blk_end;
843fd7fd89bSjsing 
844fd7fd89bSjsing 		DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
845fd7fd89bSjsing 		    "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
846fd7fd89bSjsing 		    sd->sd_meta->ssd_devname,
847fd7fd89bSjsing 		    wu_r->swu_blk_start, wu_r->swu_blk_end);
848fd7fd89bSjsing 
849fd7fd89bSjsing 		s = splbio();
850fd7fd89bSjsing 		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
851fd7fd89bSjsing 		splx(s);
852fd7fd89bSjsing 
853fd7fd89bSjsing 		sr_schedule_wu(wu_r);
854fd7fd89bSjsing 
855fd7fd89bSjsing 		slept = 0;
856fd7fd89bSjsing 		while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
857fe2d3ae9Scheloha 			tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP);
858fd7fd89bSjsing 			slept = 1;
859fd7fd89bSjsing 		}
860632f6d33Scheloha 		if (!slept) {
861632f6d33Scheloha 			tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
862632f6d33Scheloha 			    MSEC_TO_NSEC(1));
863632f6d33Scheloha 		}
864fd7fd89bSjsing 
865fd7fd89bSjsing 		sr_scsi_wu_put(sd, wu_r);
866fd7fd89bSjsing 		sr_scsi_wu_put(sd, wu_w);
867fd7fd89bSjsing 
868bd56a12fSkrw 		sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;
869fd7fd89bSjsing 
87038301a9fSjsing 		percent = sr_rebuild_percent(sd);
871fd7fd89bSjsing 		if (percent != old_percent && strip_no != chunk_strips - 1) {
872fd7fd89bSjsing 			if (sr_meta_save(sd, SR_META_DIRTY))
873fd7fd89bSjsing 				printf("%s: could not save metadata to %s\n",
874fd7fd89bSjsing 				    DEVNAME(sd->sd_sc),
875fd7fd89bSjsing 				    sd->sd_meta->ssd_devname);
876fd7fd89bSjsing 			old_percent = percent;
877fd7fd89bSjsing 		}
878fd7fd89bSjsing 
879fd7fd89bSjsing 		if (sd->sd_reb_abort)
880fd7fd89bSjsing 			goto abort;
881fd7fd89bSjsing 	}
882fd7fd89bSjsing 
883fd7fd89bSjsing 	DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
884fd7fd89bSjsing 	    sd->sd_meta->ssd_devname);
885fd7fd89bSjsing 
886fd7fd89bSjsing 	/* all done */
887fd7fd89bSjsing 	sd->sd_meta->ssd_rebuild = 0;
888fd7fd89bSjsing 	for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
889fd7fd89bSjsing 		if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
890fd7fd89bSjsing 		    BIOC_SDREBUILD) {
891fd7fd89bSjsing 			sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
892fd7fd89bSjsing 			break;
893fd7fd89bSjsing 		}
894fd7fd89bSjsing 	}
895fd7fd89bSjsing 
896fd7fd89bSjsing 	return;
897fd7fd89bSjsing 
898fd7fd89bSjsing abort:
899fd7fd89bSjsing 	if (sr_meta_save(sd, SR_META_DIRTY))
900fd7fd89bSjsing 		printf("%s: could not save metadata to %s\n",
901fd7fd89bSjsing 		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
902fd7fd89bSjsing bad:
903fd7fd89bSjsing 	return;
904fd7fd89bSjsing }
905fd7fd89bSjsing 
#if 0
/*
 * Parity scrub - compiled out.
 *
 * For each strip, reads every non-parity chunk into an XOR accumulator
 * with a read work unit and writes the result to the parity chunk with a
 * deferred write work unit collided with the read.
 *
 * NOTE(review): the 0xBADCAFE block numbers look like placeholders, and
 * the work units and buffers are reused across iterations without checks;
 * this would need work before being enabled.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	struct sr_workunit *rd_wu, *wr_wu;
	int64_t strip, strip_len, last_chunk, pchunk, nstrips, shift;
	int64_t c;
	void *acc;
	int spl, did_sleep;

	wr_wu = sr_scsi_wu_get(sd, 0);
	rd_wu = sr_scsi_wu_get(sd, 0);

	last_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_len = sd->sd_meta->ssdi.ssd_strip_size;
	shift = sd->mds.mdd_raid5.sr5_strip_bits;
	nstrips = sd->sd_meta->ssdi.ssd_size >> shift;

	for (strip = 0; strip < nstrips; strip++) {
		pchunk = last_chunk -
		    ((strip / last_chunk) % (last_chunk + 1));

		/* Read all chunks except parity into the accumulator. */
		acc = sr_block_get(sd, strip_len);
		for (c = 0; c <= last_chunk; c++) {
			if (c == pchunk)
				continue;
			sr_raid5_addio(rd_wu, c, 0xBADCAFE, strip_len,
			    NULL, SCSI_DATA_IN, 0, acc);
		}
		sr_raid5_addio(wr_wu, pchunk, 0xBADCAFE, strip_len, acc,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		rd_wu->swu_flags |= SR_WUF_REBUILD;

		/* Collide the write work unit with the read work unit. */
		wr_wu->swu_state = SR_WU_DEFERRED;
		wr_wu->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		rd_wu->swu_collider = wr_wu;

		spl = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wr_wu, swu_link);
		splx(spl);

		rd_wu->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(rd_wu);

		/* Wait for the write to complete; yield if we never slept. */
		did_sleep = 0;
		while ((wr_wu->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
			tsleep_nsec(wr_wu, PRIBIO, "sr_scrub", INFSLP);
			did_sleep = 1;
		}
		if (!did_sleep) {
			tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
			    MSEC_TO_NSEC(1));
		}
	}
}
#endif
962