/*	$OpenBSD: softraid_raid5.c,v 1.32 2021/05/16 15:12:37 deraadt Exp $ */
/*
 * Copyright (c) 2014 Joel Sing <jsing@openbsd.org>
 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
19f8000896Sjsing
20f8000896Sjsing #include "bio.h"
21f8000896Sjsing
22f8000896Sjsing #include <sys/param.h>
23f8000896Sjsing #include <sys/systm.h>
24f8000896Sjsing #include <sys/buf.h>
25f8000896Sjsing #include <sys/device.h>
26f8000896Sjsing #include <sys/ioctl.h>
27f8000896Sjsing #include <sys/malloc.h>
28f8000896Sjsing #include <sys/kernel.h>
29f8000896Sjsing #include <sys/disk.h>
30f8000896Sjsing #include <sys/rwlock.h>
31f8000896Sjsing #include <sys/queue.h>
32f8000896Sjsing #include <sys/fcntl.h>
33f8000896Sjsing #include <sys/mount.h>
34f8000896Sjsing #include <sys/sensors.h>
35f8000896Sjsing #include <sys/stat.h>
36e328a933Sjsing #include <sys/task.h>
37f8000896Sjsing #include <sys/pool.h>
38f8000896Sjsing #include <sys/conf.h>
39f8000896Sjsing #include <sys/uio.h>
40f8000896Sjsing
41f8000896Sjsing #include <scsi/scsi_all.h>
42f8000896Sjsing #include <scsi/scsiconf.h>
43f8000896Sjsing #include <scsi/scsi_disk.h>
44f8000896Sjsing
45f8000896Sjsing #include <dev/softraidvar.h>
46f8000896Sjsing
47f8000896Sjsing /* RAID 5 functions. */
48f8000896Sjsing int sr_raid5_create(struct sr_discipline *, struct bioc_createraid *,
49f8000896Sjsing int, int64_t);
50f8000896Sjsing int sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *,
51f8000896Sjsing int, void *);
52f8000896Sjsing int sr_raid5_init(struct sr_discipline *);
53f8000896Sjsing int sr_raid5_rw(struct sr_workunit *);
54f8000896Sjsing int sr_raid5_openings(struct sr_discipline *);
55f8000896Sjsing void sr_raid5_intr(struct buf *);
56f8000896Sjsing int sr_raid5_wu_done(struct sr_workunit *);
57f8000896Sjsing void sr_raid5_set_chunk_state(struct sr_discipline *, int, int);
58f8000896Sjsing void sr_raid5_set_vol_state(struct sr_discipline *);
59f8000896Sjsing
60c804f705Skrw int sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long,
61f8000896Sjsing void *, int, int, void *);
62c804f705Skrw int sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long,
63d4c09c60Sjsing void *);
64e3e73c0eSjsing int sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int,
65c804f705Skrw daddr_t, long, void *, int, int);
66d4c09c60Sjsing void sr_raid5_xor(void *, void *, int);
67d4c09c60Sjsing
68fd7fd89bSjsing void sr_raid5_rebuild(struct sr_discipline *);
69f8000896Sjsing void sr_raid5_scrub(struct sr_discipline *);
70f8000896Sjsing
71f8000896Sjsing /* discipline initialisation. */
72f8000896Sjsing void
sr_raid5_discipline_init(struct sr_discipline * sd)73f8000896Sjsing sr_raid5_discipline_init(struct sr_discipline *sd)
74f8000896Sjsing {
75f8000896Sjsing /* Fill out discipline members. */
76f8000896Sjsing sd->sd_type = SR_MD_RAID5;
77f8000896Sjsing strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name));
78f8000896Sjsing sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
79fd7fd89bSjsing SR_CAP_REBUILD | SR_CAP_REDUNDANT;
808a279608Sjsing sd->sd_max_wu = SR_RAID5_NOWU + 2; /* Two for scrub/rebuild. */
81f8000896Sjsing
82f8000896Sjsing /* Setup discipline specific function pointers. */
83f8000896Sjsing sd->sd_assemble = sr_raid5_assemble;
84f8000896Sjsing sd->sd_create = sr_raid5_create;
85f8000896Sjsing sd->sd_openings = sr_raid5_openings;
86fd7fd89bSjsing sd->sd_rebuild = sr_raid5_rebuild;
87f8000896Sjsing sd->sd_scsi_rw = sr_raid5_rw;
88f8000896Sjsing sd->sd_scsi_intr = sr_raid5_intr;
89f8000896Sjsing sd->sd_scsi_wu_done = sr_raid5_wu_done;
90f8000896Sjsing sd->sd_set_chunk_state = sr_raid5_set_chunk_state;
91f8000896Sjsing sd->sd_set_vol_state = sr_raid5_set_vol_state;
92f8000896Sjsing }
93f8000896Sjsing
94f8000896Sjsing int
sr_raid5_create(struct sr_discipline * sd,struct bioc_createraid * bc,int no_chunk,int64_t coerced_size)95f8000896Sjsing sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc,
96f8000896Sjsing int no_chunk, int64_t coerced_size)
97f8000896Sjsing {
98f8000896Sjsing if (no_chunk < 3) {
99f8000896Sjsing sr_error(sd->sd_sc, "%s requires three or more chunks",
100f8000896Sjsing sd->sd_name);
101f8000896Sjsing return EINVAL;
102f8000896Sjsing }
103f8000896Sjsing
104f8000896Sjsing /*
105f8000896Sjsing * XXX add variable strip size later even though MAXPHYS is really
106f8000896Sjsing * the clever value, users like to tinker with that type of stuff.
107f8000896Sjsing */
108f8000896Sjsing sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS;
109f8000896Sjsing sd->sd_meta->ssdi.ssd_size = (coerced_size &
110f8000896Sjsing ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >>
111f8000896Sjsing DEV_BSHIFT) - 1)) * (no_chunk - 1);
112f8000896Sjsing
113f8000896Sjsing return sr_raid5_init(sd);
114f8000896Sjsing }
115f8000896Sjsing
/*
 * Assemble an existing RAID 5 volume.  Nothing beyond the common runtime
 * initialisation is done here; bc, no_chunk and data are not used.
 */
int
sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
    int no_chunk, void *data)
{
	int			rv;

	rv = sr_raid5_init(sd);

	return rv;
}
122f8000896Sjsing
123f8000896Sjsing int
sr_raid5_init(struct sr_discipline * sd)124f8000896Sjsing sr_raid5_init(struct sr_discipline *sd)
125f8000896Sjsing {
126f8000896Sjsing /* Initialise runtime values. */
127f8000896Sjsing sd->mds.mdd_raid5.sr5_strip_bits =
128f8000896Sjsing sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
129f8000896Sjsing if (sd->mds.mdd_raid5.sr5_strip_bits == -1) {
130f8000896Sjsing sr_error(sd->sd_sc, "invalid strip size");
131f8000896Sjsing return EINVAL;
132f8000896Sjsing }
133f8000896Sjsing
1343976ba54Skrw sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
1353976ba54Skrw
136f8000896Sjsing return 0;
137f8000896Sjsing }
138f8000896Sjsing
139f8000896Sjsing int
sr_raid5_openings(struct sr_discipline * sd)140f8000896Sjsing sr_raid5_openings(struct sr_discipline *sd)
141f8000896Sjsing {
1428a279608Sjsing /* Two work units per I/O, two for rebuild/scrub. */
1438a279608Sjsing return ((sd->sd_max_wu - 2) >> 1);
144f8000896Sjsing }
145f8000896Sjsing
146f8000896Sjsing void
sr_raid5_set_chunk_state(struct sr_discipline * sd,int c,int new_state)147f8000896Sjsing sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
148f8000896Sjsing {
149f8000896Sjsing int old_state, s;
150f8000896Sjsing
151f8000896Sjsing DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
152f8000896Sjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
153f8000896Sjsing sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);
154f8000896Sjsing
155f8000896Sjsing /* ok to go to splbio since this only happens in error path */
156f8000896Sjsing s = splbio();
157f8000896Sjsing old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;
158f8000896Sjsing
159f8000896Sjsing /* multiple IOs to the same chunk that fail will come through here */
160f8000896Sjsing if (old_state == new_state)
161f8000896Sjsing goto done;
162f8000896Sjsing
163f8000896Sjsing switch (old_state) {
164f8000896Sjsing case BIOC_SDONLINE:
165f8000896Sjsing switch (new_state) {
166f8000896Sjsing case BIOC_SDOFFLINE:
167f8000896Sjsing case BIOC_SDSCRUB:
168f8000896Sjsing break;
169f8000896Sjsing default:
170f8000896Sjsing goto die;
171f8000896Sjsing }
172f8000896Sjsing break;
173f8000896Sjsing
174f8000896Sjsing case BIOC_SDOFFLINE:
175f8000896Sjsing if (new_state == BIOC_SDREBUILD) {
176f8000896Sjsing ;
177f8000896Sjsing } else
178f8000896Sjsing goto die;
179f8000896Sjsing break;
180f8000896Sjsing
181f8000896Sjsing case BIOC_SDSCRUB:
182f8000896Sjsing switch (new_state) {
183f8000896Sjsing case BIOC_SDONLINE:
184f8000896Sjsing case BIOC_SDOFFLINE:
185f8000896Sjsing break;
186f8000896Sjsing default:
187f8000896Sjsing goto die;
188f8000896Sjsing }
189f8000896Sjsing break;
190f8000896Sjsing
191f8000896Sjsing case BIOC_SDREBUILD:
192f8000896Sjsing switch (new_state) {
193f8000896Sjsing case BIOC_SDONLINE:
194f8000896Sjsing case BIOC_SDOFFLINE:
195f8000896Sjsing break;
196f8000896Sjsing default:
197f8000896Sjsing goto die;
198f8000896Sjsing }
199f8000896Sjsing break;
200f8000896Sjsing
201f8000896Sjsing default:
202f8000896Sjsing die:
203f8000896Sjsing splx(s); /* XXX */
204*f6d8fcaeSderaadt panic("%s: %s: %s: invalid chunk state transition %d -> %d",
205*f6d8fcaeSderaadt DEVNAME(sd->sd_sc),
206f8000896Sjsing sd->sd_meta->ssd_devname,
207f8000896Sjsing sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
208f8000896Sjsing old_state, new_state);
209f8000896Sjsing /* NOTREACHED */
210f8000896Sjsing }
211f8000896Sjsing
212f8000896Sjsing sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
213f8000896Sjsing sd->sd_set_vol_state(sd);
214f8000896Sjsing
215f8000896Sjsing sd->sd_must_flush = 1;
216e328a933Sjsing task_add(systq, &sd->sd_meta_save_task);
217f8000896Sjsing done:
218f8000896Sjsing splx(s);
219f8000896Sjsing }
220f8000896Sjsing
221f8000896Sjsing void
sr_raid5_set_vol_state(struct sr_discipline * sd)222f8000896Sjsing sr_raid5_set_vol_state(struct sr_discipline *sd)
223f8000896Sjsing {
224f8000896Sjsing int states[SR_MAX_STATES];
225f8000896Sjsing int new_state, i, s, nd;
226f8000896Sjsing int old_state = sd->sd_vol_status;
227f8000896Sjsing
228f8000896Sjsing DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
229f8000896Sjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
230f8000896Sjsing
231f8000896Sjsing nd = sd->sd_meta->ssdi.ssd_chunk_no;
232f8000896Sjsing
233f8000896Sjsing for (i = 0; i < SR_MAX_STATES; i++)
234f8000896Sjsing states[i] = 0;
235f8000896Sjsing
236f8000896Sjsing for (i = 0; i < nd; i++) {
237f8000896Sjsing s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
238f8000896Sjsing if (s >= SR_MAX_STATES)
239f8000896Sjsing panic("%s: %s: %s: invalid chunk state",
240f8000896Sjsing DEVNAME(sd->sd_sc),
241f8000896Sjsing sd->sd_meta->ssd_devname,
242f8000896Sjsing sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
243f8000896Sjsing states[s]++;
244f8000896Sjsing }
245f8000896Sjsing
246f8000896Sjsing if (states[BIOC_SDONLINE] == nd)
247f8000896Sjsing new_state = BIOC_SVONLINE;
248f8000896Sjsing else if (states[BIOC_SDONLINE] < nd - 1)
249f8000896Sjsing new_state = BIOC_SVOFFLINE;
250f8000896Sjsing else if (states[BIOC_SDSCRUB] != 0)
251f8000896Sjsing new_state = BIOC_SVSCRUB;
252f8000896Sjsing else if (states[BIOC_SDREBUILD] != 0)
253f8000896Sjsing new_state = BIOC_SVREBUILD;
254f8000896Sjsing else if (states[BIOC_SDONLINE] == nd - 1)
255f8000896Sjsing new_state = BIOC_SVDEGRADED;
256f8000896Sjsing else {
257f8000896Sjsing #ifdef SR_DEBUG
258f8000896Sjsing DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state "
259f8000896Sjsing "was %d\n", DEVNAME(sd->sd_sc), old_state);
260f8000896Sjsing for (i = 0; i < nd; i++)
261f8000896Sjsing DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n",
262f8000896Sjsing DEVNAME(sd->sd_sc), i,
263f8000896Sjsing sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
264f8000896Sjsing #endif
265f8000896Sjsing panic("invalid volume state");
266f8000896Sjsing }
267f8000896Sjsing
268f8000896Sjsing DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n",
269f8000896Sjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
270f8000896Sjsing old_state, new_state);
271f8000896Sjsing
272f8000896Sjsing switch (old_state) {
273f8000896Sjsing case BIOC_SVONLINE:
274f8000896Sjsing switch (new_state) {
275f8000896Sjsing case BIOC_SVONLINE: /* can go to same state */
2760e0446b0Sjsing case BIOC_SVOFFLINE:
277f8000896Sjsing case BIOC_SVDEGRADED:
278f8000896Sjsing case BIOC_SVREBUILD: /* happens on boot */
279f8000896Sjsing break;
280f8000896Sjsing default:
281f8000896Sjsing goto die;
282f8000896Sjsing }
283f8000896Sjsing break;
284f8000896Sjsing
285f8000896Sjsing case BIOC_SVOFFLINE:
286f8000896Sjsing /* XXX this might be a little too much */
287f8000896Sjsing goto die;
288f8000896Sjsing
28997e41644Sjsing case BIOC_SVDEGRADED:
29097e41644Sjsing switch (new_state) {
29197e41644Sjsing case BIOC_SVOFFLINE:
29297e41644Sjsing case BIOC_SVREBUILD:
29397e41644Sjsing case BIOC_SVDEGRADED: /* can go to the same state */
29497e41644Sjsing break;
29597e41644Sjsing default:
29697e41644Sjsing goto die;
29797e41644Sjsing }
29897e41644Sjsing break;
29997e41644Sjsing
3000e0446b0Sjsing case BIOC_SVBUILDING:
3010e0446b0Sjsing switch (new_state) {
3020e0446b0Sjsing case BIOC_SVONLINE:
3030e0446b0Sjsing case BIOC_SVOFFLINE:
3040e0446b0Sjsing case BIOC_SVBUILDING: /* can go to the same state */
3050e0446b0Sjsing break;
3060e0446b0Sjsing default:
3070e0446b0Sjsing goto die;
3080e0446b0Sjsing }
3090e0446b0Sjsing break;
3100e0446b0Sjsing
311f8000896Sjsing case BIOC_SVSCRUB:
312f8000896Sjsing switch (new_state) {
313f8000896Sjsing case BIOC_SVONLINE:
314f8000896Sjsing case BIOC_SVOFFLINE:
315f8000896Sjsing case BIOC_SVDEGRADED:
316f8000896Sjsing case BIOC_SVSCRUB: /* can go to same state */
317f8000896Sjsing break;
318f8000896Sjsing default:
319f8000896Sjsing goto die;
320f8000896Sjsing }
321f8000896Sjsing break;
322f8000896Sjsing
323f8000896Sjsing case BIOC_SVREBUILD:
324f8000896Sjsing switch (new_state) {
325f8000896Sjsing case BIOC_SVONLINE:
326f8000896Sjsing case BIOC_SVOFFLINE:
327f8000896Sjsing case BIOC_SVDEGRADED:
328f8000896Sjsing case BIOC_SVREBUILD: /* can go to the same state */
329f8000896Sjsing break;
330f8000896Sjsing default:
331f8000896Sjsing goto die;
332f8000896Sjsing }
333f8000896Sjsing break;
334f8000896Sjsing
335f8000896Sjsing default:
336f8000896Sjsing die:
337f8000896Sjsing panic("%s: %s: invalid volume state transition %d -> %d",
338f8000896Sjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
339f8000896Sjsing old_state, new_state);
340f8000896Sjsing /* NOTREACHED */
341f8000896Sjsing }
342f8000896Sjsing
343f8000896Sjsing sd->sd_vol_status = new_state;
344f8000896Sjsing }
345f8000896Sjsing
346d4c09c60Sjsing static inline int
sr_raid5_chunk_online(struct sr_discipline * sd,int chunk)347d4c09c60Sjsing sr_raid5_chunk_online(struct sr_discipline *sd, int chunk)
348d4c09c60Sjsing {
349d4c09c60Sjsing switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
350d4c09c60Sjsing case BIOC_SDONLINE:
351d4c09c60Sjsing case BIOC_SDSCRUB:
352d4c09c60Sjsing return 1;
353d4c09c60Sjsing default:
354d4c09c60Sjsing return 0;
355d4c09c60Sjsing }
356d4c09c60Sjsing }
357d4c09c60Sjsing
358efce0101Sjsing static inline int
sr_raid5_chunk_rebuild(struct sr_discipline * sd,int chunk)359efce0101Sjsing sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk)
360efce0101Sjsing {
361efce0101Sjsing switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) {
362efce0101Sjsing case BIOC_SDREBUILD:
363efce0101Sjsing return 1;
364efce0101Sjsing default:
365efce0101Sjsing return 0;
366efce0101Sjsing }
367efce0101Sjsing }
368efce0101Sjsing
369f8000896Sjsing int
sr_raid5_rw(struct sr_workunit * wu)370f8000896Sjsing sr_raid5_rw(struct sr_workunit *wu)
371f8000896Sjsing {
372f8000896Sjsing struct sr_workunit *wu_r = NULL;
373f8000896Sjsing struct sr_discipline *sd = wu->swu_dis;
374f8000896Sjsing struct scsi_xfer *xs = wu->swu_xs;
375f8000896Sjsing struct sr_chunk *scp;
376d9ec6765Skrw daddr_t blkno, lba;
377d9ec6765Skrw int64_t chunk_offs, lbaoffs, offset, strip_offs;
378f8000896Sjsing int64_t strip_bits, strip_no, strip_size;
379f8000896Sjsing int64_t chunk, no_chunk;
380c804f705Skrw int64_t parity, row_size;
381c804f705Skrw long length, datalen;
382e3e73c0eSjsing void *data;
383d4c09c60Sjsing int s;
384f8000896Sjsing
385d9ec6765Skrw /* blkno and scsi error will be handled by sr_validate_io */
386d9ec6765Skrw if (sr_validate_io(wu, &blkno, "sr_raid5_rw"))
387f8000896Sjsing goto bad;
388f8000896Sjsing
389d9ec6765Skrw DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n",
390e3e73c0eSjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
391e3e73c0eSjsing (xs->flags & SCSI_DATA_IN) ? "read" : "write",
392d9ec6765Skrw (long long)blkno, xs->datalen);
393e3e73c0eSjsing
394f8000896Sjsing strip_size = sd->sd_meta->ssdi.ssd_strip_size;
395f8000896Sjsing strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
396f8000896Sjsing no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
397f8000896Sjsing row_size = (no_chunk << strip_bits) >> DEV_BSHIFT;
398f8000896Sjsing
399f8000896Sjsing data = xs->data;
400f8000896Sjsing datalen = xs->datalen;
401d9ec6765Skrw lbaoffs = blkno << DEV_BSHIFT;
402f8000896Sjsing
403f8000896Sjsing if (xs->flags & SCSI_DATA_OUT) {
404f8000896Sjsing if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){
4058a279608Sjsing printf("%s: %s failed to get read work unit",
4068a279608Sjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
407f8000896Sjsing goto bad;
408f8000896Sjsing }
409f8000896Sjsing wu_r->swu_state = SR_WU_INPROGRESS;
410f8000896Sjsing wu_r->swu_flags |= SR_WUF_DISCIPLINE;
411f8000896Sjsing }
412f8000896Sjsing
413f8000896Sjsing wu->swu_blk_start = 0;
414f8000896Sjsing while (datalen != 0) {
415f8000896Sjsing strip_no = lbaoffs >> strip_bits;
416f8000896Sjsing strip_offs = lbaoffs & (strip_size - 1);
417f8000896Sjsing chunk_offs = (strip_no / no_chunk) << strip_bits;
418d9ec6765Skrw offset = chunk_offs + strip_offs;
419f8000896Sjsing
420f8000896Sjsing /* get size remaining in this stripe */
421f8000896Sjsing length = MIN(strip_size - strip_offs, datalen);
422f8000896Sjsing
4232e5d087bSjsing /*
4242e5d087bSjsing * Map disk offset to data and parity chunks, using a left
4252e5d087bSjsing * asymmetric algorithm for the parity assignment.
4262e5d087bSjsing */
427f8000896Sjsing chunk = strip_no % no_chunk;
428f8000896Sjsing parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1));
429f8000896Sjsing if (chunk >= parity)
430f8000896Sjsing chunk++;
431f8000896Sjsing
432d9ec6765Skrw lba = offset >> DEV_BSHIFT;
433f8000896Sjsing
434f8000896Sjsing /* XXX big hammer.. exclude I/O from entire stripe */
435f8000896Sjsing if (wu->swu_blk_start == 0)
436f8000896Sjsing wu->swu_blk_start = (strip_no / no_chunk) * row_size;
4372e5d087bSjsing wu->swu_blk_end = (strip_no / no_chunk) * row_size +
4382e5d087bSjsing (row_size - 1);
439f8000896Sjsing
440f8000896Sjsing scp = sd->sd_vol.sv_chunks[chunk];
441f8000896Sjsing if (xs->flags & SCSI_DATA_IN) {
442f8000896Sjsing switch (scp->src_meta.scm_status) {
443f8000896Sjsing case BIOC_SDONLINE:
444f8000896Sjsing case BIOC_SDSCRUB:
4452e5d087bSjsing /*
4462e5d087bSjsing * Chunk is online, issue a single read
4472e5d087bSjsing * request.
4482e5d087bSjsing */
449f8000896Sjsing if (sr_raid5_addio(wu, chunk, lba, length,
450f8000896Sjsing data, xs->flags, 0, NULL))
451f8000896Sjsing goto bad;
452f8000896Sjsing break;
453f8000896Sjsing case BIOC_SDOFFLINE:
454f8000896Sjsing case BIOC_SDREBUILD:
455f8000896Sjsing case BIOC_SDHOTSPARE:
456d4c09c60Sjsing if (sr_raid5_regenerate(wu, chunk, lba,
457d4c09c60Sjsing length, data))
458f8000896Sjsing goto bad;
459f8000896Sjsing break;
460f8000896Sjsing default:
461f8000896Sjsing printf("%s: is offline, can't read\n",
462f8000896Sjsing DEVNAME(sd->sd_sc));
463f8000896Sjsing goto bad;
464f8000896Sjsing }
465f8000896Sjsing } else {
466e3e73c0eSjsing if (sr_raid5_write(wu, wu_r, chunk, parity, lba,
467e3e73c0eSjsing length, data, xs->flags, 0))
468f8000896Sjsing goto bad;
469f8000896Sjsing }
470f8000896Sjsing
471f8000896Sjsing /* advance to next block */
472f8000896Sjsing lbaoffs += length;
473f8000896Sjsing datalen -= length;
474f8000896Sjsing data += length;
475f8000896Sjsing }
476f8000896Sjsing
477f8000896Sjsing s = splbio();
478f8000896Sjsing if (wu_r) {
479ebed995dSjsing if (wu_r->swu_io_count > 0) {
480f8000896Sjsing /* collide write request with reads */
481f8000896Sjsing wu_r->swu_blk_start = wu->swu_blk_start;
482f8000896Sjsing wu_r->swu_blk_end = wu->swu_blk_end;
483f8000896Sjsing
484f8000896Sjsing wu->swu_state = SR_WU_DEFERRED;
485f8000896Sjsing wu_r->swu_collider = wu;
486f8000896Sjsing TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
487f8000896Sjsing
488f8000896Sjsing wu = wu_r;
489ebed995dSjsing } else {
490ebed995dSjsing sr_scsi_wu_put(sd, wu_r);
491ebed995dSjsing }
492f8000896Sjsing }
493f8000896Sjsing splx(s);
494f8000896Sjsing
495f8000896Sjsing sr_schedule_wu(wu);
496f8000896Sjsing
497f8000896Sjsing return (0);
498f8000896Sjsing
499f8000896Sjsing bad:
500f8000896Sjsing /* wu is unwound by sr_wu_put */
501f8000896Sjsing if (wu_r)
502f8000896Sjsing sr_scsi_wu_put(sd, wu_r);
503f8000896Sjsing return (1);
504f8000896Sjsing }
505f8000896Sjsing
506d4c09c60Sjsing int
sr_raid5_regenerate(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data)507d4c09c60Sjsing sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno,
508c804f705Skrw long len, void *data)
509d4c09c60Sjsing {
510d4c09c60Sjsing struct sr_discipline *sd = wu->swu_dis;
511d4c09c60Sjsing int i;
512d4c09c60Sjsing
513d4c09c60Sjsing /*
514d4c09c60Sjsing * Regenerate a block on a RAID 5 volume by xoring the data and parity
515d4c09c60Sjsing * from all of the remaining online chunks. This requires the parity
516d4c09c60Sjsing * to already be correct.
517d4c09c60Sjsing */
518d4c09c60Sjsing
519d4c09c60Sjsing DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, "
520d4c09c60Sjsing "regenerating block %llu\n",
521d4c09c60Sjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno);
522d4c09c60Sjsing
523d4c09c60Sjsing memset(data, 0, len);
524d4c09c60Sjsing for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
525d4c09c60Sjsing if (i == chunk)
526d4c09c60Sjsing continue;
527d4c09c60Sjsing if (!sr_raid5_chunk_online(sd, i))
528d4c09c60Sjsing goto bad;
529d4c09c60Sjsing if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN,
530d4c09c60Sjsing 0, data))
531d4c09c60Sjsing goto bad;
532d4c09c60Sjsing }
533d4c09c60Sjsing return (0);
534d4c09c60Sjsing
535d4c09c60Sjsing bad:
536d4c09c60Sjsing return (1);
537d4c09c60Sjsing }
538d4c09c60Sjsing
539e3e73c0eSjsing int
sr_raid5_write(struct sr_workunit * wu,struct sr_workunit * wu_r,int chunk,int parity,daddr_t blkno,long len,void * data,int xsflags,int ccbflags)540e3e73c0eSjsing sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk,
541c804f705Skrw int parity, daddr_t blkno, long len, void *data, int xsflags,
542e3e73c0eSjsing int ccbflags)
543e3e73c0eSjsing {
544e3e73c0eSjsing struct sr_discipline *sd = wu->swu_dis;
545e3e73c0eSjsing struct scsi_xfer *xs = wu->swu_xs;
546e3e73c0eSjsing void *xorbuf;
547efce0101Sjsing int chunk_online, chunk_rebuild;
548efce0101Sjsing int parity_online, parity_rebuild;
549efce0101Sjsing int other_offline = 0, other_rebuild = 0;
550e3e73c0eSjsing int i;
551e3e73c0eSjsing
552e3e73c0eSjsing /*
553e3e73c0eSjsing * Perform a write to a RAID 5 volume. This write routine does not
554e3e73c0eSjsing * require the parity to already be correct and will operate on a
555e3e73c0eSjsing * uninitialised volume.
556e3e73c0eSjsing *
557e3e73c0eSjsing * There are four possible cases:
558e3e73c0eSjsing *
559e3e73c0eSjsing * 1) All data chunks and parity are online. In this case we read the
560e3e73c0eSjsing * data from all data chunks, except the one we are writing to, in
561e3e73c0eSjsing * order to calculate and write the new parity.
562e3e73c0eSjsing *
563e3e73c0eSjsing * 2) The parity chunk is offline. In this case we only need to write
564e3e73c0eSjsing * to the data chunk. No parity calculation is required.
565e3e73c0eSjsing *
566e3e73c0eSjsing * 3) The data chunk is offline. In this case we read the data from all
567e3e73c0eSjsing * online chunks in order to calculate and write the new parity.
568e3e73c0eSjsing * This is the same as (1) except we do not write the data chunk.
569e3e73c0eSjsing *
570e3e73c0eSjsing * 4) A different data chunk is offline. The new parity is calculated
571e3e73c0eSjsing * by taking the existing parity, xoring the original data and
572e3e73c0eSjsing * xoring in the new data. This requires that the parity already be
573e3e73c0eSjsing * correct, which it will be if any of the data chunks has
574e3e73c0eSjsing * previously been written.
575efce0101Sjsing *
576efce0101Sjsing * There is an additional complication introduced by a chunk that is
577efce0101Sjsing * being rebuilt. If this is the data or parity chunk, then we want
578efce0101Sjsing * to write to it as per normal. If it is another data chunk then we
579efce0101Sjsing * need to presume that it has not yet been regenerated and use the
580efce0101Sjsing * same method as detailed in (4) above.
581e3e73c0eSjsing */
582e3e73c0eSjsing
583e3e73c0eSjsing DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i "
584d9ec6765Skrw "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
585e3e73c0eSjsing chunk, parity, (unsigned long long)blkno);
586e3e73c0eSjsing
587e3e73c0eSjsing chunk_online = sr_raid5_chunk_online(sd, chunk);
588efce0101Sjsing chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk);
589e3e73c0eSjsing parity_online = sr_raid5_chunk_online(sd, parity);
590efce0101Sjsing parity_rebuild = sr_raid5_chunk_rebuild(sd, parity);
591e3e73c0eSjsing
592e3e73c0eSjsing for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
593e3e73c0eSjsing if (i == chunk || i == parity)
594e3e73c0eSjsing continue;
595efce0101Sjsing if (sr_raid5_chunk_rebuild(sd, i))
596efce0101Sjsing other_rebuild = 1;
597efce0101Sjsing else if (!sr_raid5_chunk_online(sd, i))
598e3e73c0eSjsing other_offline = 1;
599e3e73c0eSjsing }
600e3e73c0eSjsing
601e3e73c0eSjsing DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, "
602e3e73c0eSjsing "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
603e3e73c0eSjsing chunk_online, parity_online, other_offline);
604e3e73c0eSjsing
605efce0101Sjsing if (!parity_online && !parity_rebuild)
606e3e73c0eSjsing goto data_write;
607e3e73c0eSjsing
608e3e73c0eSjsing xorbuf = sr_block_get(sd, len);
609e3e73c0eSjsing if (xorbuf == NULL)
610e3e73c0eSjsing goto bad;
611e3e73c0eSjsing memcpy(xorbuf, data, len);
612e3e73c0eSjsing
613efce0101Sjsing if (other_offline || other_rebuild) {
614e3e73c0eSjsing
615e3e73c0eSjsing /*
616e3e73c0eSjsing * XXX - If we can guarantee that this LBA has been scrubbed
617e3e73c0eSjsing * then we can also take this faster path.
618e3e73c0eSjsing */
619e3e73c0eSjsing
620e3e73c0eSjsing /* Read in existing data and existing parity. */
621e3e73c0eSjsing if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL,
622e3e73c0eSjsing SCSI_DATA_IN, 0, xorbuf))
623e3e73c0eSjsing goto bad;
624e3e73c0eSjsing if (sr_raid5_addio(wu_r, parity, blkno, len, NULL,
625e3e73c0eSjsing SCSI_DATA_IN, 0, xorbuf))
626e3e73c0eSjsing goto bad;
627e3e73c0eSjsing
628e3e73c0eSjsing } else {
629e3e73c0eSjsing
630e3e73c0eSjsing /* Read in existing data from all other chunks. */
631e3e73c0eSjsing for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
632e3e73c0eSjsing if (i == chunk || i == parity)
633e3e73c0eSjsing continue;
634e3e73c0eSjsing if (sr_raid5_addio(wu_r, i, blkno, len, NULL,
635e3e73c0eSjsing SCSI_DATA_IN, 0, xorbuf))
636e3e73c0eSjsing goto bad;
637e3e73c0eSjsing }
638e3e73c0eSjsing
639e3e73c0eSjsing }
640e3e73c0eSjsing
641e3e73c0eSjsing /* Write new parity. */
642e3e73c0eSjsing if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags,
643e3e73c0eSjsing SR_CCBF_FREEBUF, NULL))
644e3e73c0eSjsing goto bad;
645e3e73c0eSjsing
646e3e73c0eSjsing data_write:
647e3e73c0eSjsing /* Write new data. */
648efce0101Sjsing if (chunk_online || chunk_rebuild)
649e3e73c0eSjsing if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags,
650e3e73c0eSjsing 0, NULL))
651e3e73c0eSjsing goto bad;
652e3e73c0eSjsing
653e3e73c0eSjsing return (0);
654e3e73c0eSjsing
655e3e73c0eSjsing bad:
656e3e73c0eSjsing return (1);
657e3e73c0eSjsing }
658e3e73c0eSjsing
659f8000896Sjsing void
sr_raid5_intr(struct buf * bp)660f8000896Sjsing sr_raid5_intr(struct buf *bp)
661f8000896Sjsing {
662f8000896Sjsing struct sr_ccb *ccb = (struct sr_ccb *)bp;
663f8000896Sjsing struct sr_workunit *wu = ccb->ccb_wu;
664f8000896Sjsing struct sr_discipline *sd = wu->swu_dis;
665f8000896Sjsing int s;
666f8000896Sjsing
667f8000896Sjsing DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n",
668f8000896Sjsing DEVNAME(sd->sd_sc), bp, wu->swu_xs);
669f8000896Sjsing
670f8000896Sjsing s = splbio();
671f8000896Sjsing sr_ccb_done(ccb);
672f8000896Sjsing
673e328a933Sjsing /* XXX - Should this be done via the taskq? */
674f8000896Sjsing
675f8000896Sjsing /* XOR data to result. */
676f8000896Sjsing if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque)
677f8000896Sjsing sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data,
678f8000896Sjsing ccb->ccb_buf.b_bcount);
679f8000896Sjsing
680f8000896Sjsing /* Free allocated data buffer. */
681f8000896Sjsing if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
6824c26ac15Sjsing sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
683f8000896Sjsing ccb->ccb_buf.b_data = NULL;
684f8000896Sjsing }
685f8000896Sjsing
686f8000896Sjsing sr_wu_done(wu);
687f8000896Sjsing splx(s);
688f8000896Sjsing }
689f8000896Sjsing
690f8000896Sjsing int
sr_raid5_wu_done(struct sr_workunit * wu)691f8000896Sjsing sr_raid5_wu_done(struct sr_workunit *wu)
692f8000896Sjsing {
693f8000896Sjsing struct sr_discipline *sd = wu->swu_dis;
694f8000896Sjsing struct scsi_xfer *xs = wu->swu_xs;
695f8000896Sjsing
696f8000896Sjsing /* XXX - we have no way of propagating errors... */
697a6c8c894Sjsing if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD))
698f8000896Sjsing return SR_WU_OK;
699f8000896Sjsing
700d4c09c60Sjsing /* XXX - This is insufficient for RAID 5. */
701f8000896Sjsing if (wu->swu_ios_succeeded > 0) {
702f8000896Sjsing xs->error = XS_NOERROR;
703f8000896Sjsing return SR_WU_OK;
704f8000896Sjsing }
705f8000896Sjsing
706f8000896Sjsing if (xs->flags & SCSI_DATA_IN) {
707f8000896Sjsing printf("%s: retrying read on block %lld\n",
708f8000896Sjsing sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
709f8000896Sjsing sr_wu_release_ccbs(wu);
710f8000896Sjsing wu->swu_state = SR_WU_RESTART;
711f8000896Sjsing if (sd->sd_scsi_rw(wu) == 0)
712f8000896Sjsing return SR_WU_RESTART;
713f8000896Sjsing } else {
714a6c8c894Sjsing /* XXX - retry write if we just went from online to degraded. */
715f8000896Sjsing printf("%s: permanently fail write on block %lld\n",
716f8000896Sjsing sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start);
717f8000896Sjsing }
718f8000896Sjsing
719f8000896Sjsing wu->swu_state = SR_WU_FAILED;
720f8000896Sjsing xs->error = XS_DRIVER_STUFFUP;
721f8000896Sjsing
722f8000896Sjsing return SR_WU_FAILED;
723f8000896Sjsing }
724f8000896Sjsing
725f8000896Sjsing int
sr_raid5_addio(struct sr_workunit * wu,int chunk,daddr_t blkno,long len,void * data,int xsflags,int ccbflags,void * xorbuf)726f8000896Sjsing sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
727c804f705Skrw long len, void *data, int xsflags, int ccbflags, void *xorbuf)
728f8000896Sjsing {
729f8000896Sjsing struct sr_discipline *sd = wu->swu_dis;
730f8000896Sjsing struct sr_ccb *ccb;
731f8000896Sjsing
732d4c09c60Sjsing DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld "
733c804f705Skrw "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
734c804f705Skrw chunk, (long long)blkno, len, xorbuf ? "X0R" : "-");
735f8000896Sjsing
736f8000896Sjsing /* Allocate temporary buffer. */
737f8000896Sjsing if (data == NULL) {
7384c26ac15Sjsing data = sr_block_get(sd, len);
739f8000896Sjsing if (data == NULL)
740f8000896Sjsing return (-1);
741f8000896Sjsing ccbflags |= SR_CCBF_FREEBUF;
742f8000896Sjsing }
743f8000896Sjsing
744f8000896Sjsing ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags);
745f8000896Sjsing if (ccb == NULL) {
746f8000896Sjsing if (ccbflags & SR_CCBF_FREEBUF)
7474c26ac15Sjsing sr_block_put(sd, data, len);
748f8000896Sjsing return (-1);
749f8000896Sjsing }
750f8000896Sjsing ccb->ccb_opaque = xorbuf;
751f8000896Sjsing sr_wu_enqueue_ccb(wu, ccb);
752f8000896Sjsing
753f8000896Sjsing return (0);
754f8000896Sjsing }
755f8000896Sjsing
/*
 * XOR len bytes of b into a, in place (a[i] ^= b[i]); b is only read.
 *
 * a   - destination buffer, updated in place.
 * b   - source buffer.
 * len - number of bytes to process; values <= 0 leave a untouched.
 *
 * The bulk of the buffer is processed 32 bits at a time and any remaining
 * 1-3 bytes are processed individually, so lengths that are not a multiple
 * of four are handled in full.
 *
 * NOTE(review): the word loop dereferences a and b as uint32_t, so both are
 * assumed to be suitably aligned for 32-bit access - confirm for any new
 * caller.
 */
void
sr_raid5_xor(void *a, void *b, int len)
{
	uint32_t	*xa = a, *xb = b;
	uint8_t		*ta, *tb;
	int		 words, tail;

	/*
	 * Shifting a negative length right would leave a negative count and
	 * the countdown loop below would then run away through memory.
	 */
	if (len <= 0)
		return;

	words = len >> 2;
	tail = len & 3;

	while (words--)
		*xa++ ^= *xb++;

	/* Fold in the bytes that do not fill a whole word. */
	ta = (uint8_t *)xa;
	tb = (uint8_t *)xb;
	while (tail--)
		*ta++ ^= *tb++;
}
765f8000896Sjsing
766fd7fd89bSjsing void
sr_raid5_rebuild(struct sr_discipline * sd)767fd7fd89bSjsing sr_raid5_rebuild(struct sr_discipline *sd)
768fd7fd89bSjsing {
76938301a9fSjsing int64_t strip_no, strip_size, strip_bits, i, restart;
770fd7fd89bSjsing int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size;
771fd7fd89bSjsing struct sr_workunit *wu_r, *wu_w;
772fd7fd89bSjsing int s, slept, percent = 0, old_percent = -1;
773fd7fd89bSjsing int rebuild_chunk = -1;
774fd7fd89bSjsing void *xorbuf;
775fd7fd89bSjsing
776fd7fd89bSjsing /* Find the rebuild chunk. */
777fd7fd89bSjsing for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
778fd7fd89bSjsing if (sr_raid5_chunk_rebuild(sd, i)) {
779fd7fd89bSjsing rebuild_chunk = i;
780fd7fd89bSjsing break;
781fd7fd89bSjsing }
782fd7fd89bSjsing }
783fd7fd89bSjsing if (rebuild_chunk == -1)
784fd7fd89bSjsing goto bad;
785fd7fd89bSjsing
786fd7fd89bSjsing strip_size = sd->sd_meta->ssdi.ssd_strip_size;
787fd7fd89bSjsing strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
788fd7fd89bSjsing chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1;
789fd7fd89bSjsing chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count;
790fd7fd89bSjsing chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits;
791fd7fd89bSjsing row_size = (chunk_count << strip_bits) >> DEV_BSHIFT;
792fd7fd89bSjsing
793fd7fd89bSjsing DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, "
794fd7fd89bSjsing "chunk count = %lld, chunk size = %lld, chunk strips = %lld, "
795fd7fd89bSjsing "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
796fd7fd89bSjsing sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips,
797fd7fd89bSjsing row_size);
798fd7fd89bSjsing
799fd37253eSjsing restart = sd->sd_meta->ssd_rebuild / row_size;
800fd37253eSjsing if (restart > chunk_strips) {
801fd37253eSjsing printf("%s: bogus rebuild restart offset, starting from 0\n",
802fd37253eSjsing DEVNAME(sd->sd_sc));
803fd37253eSjsing restart = 0;
804fd37253eSjsing }
805fd37253eSjsing if (restart != 0) {
80638301a9fSjsing percent = sr_rebuild_percent(sd);
807fd37253eSjsing printf("%s: resuming rebuild on %s at %d%%\n",
808fd37253eSjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent);
809fd37253eSjsing }
810fd37253eSjsing
811fd37253eSjsing for (strip_no = restart; strip_no < chunk_strips; strip_no++) {
812bd56a12fSkrw chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no;
813fd7fd89bSjsing
814fd7fd89bSjsing DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, "
815fd7fd89bSjsing "chunk lba = %lld\n", DEVNAME(sd->sd_sc),
816fd7fd89bSjsing sd->sd_meta->ssd_devname, strip_no, chunk_lba);
817fd7fd89bSjsing
818fd7fd89bSjsing wu_w = sr_scsi_wu_get(sd, 0);
819fd7fd89bSjsing wu_r = sr_scsi_wu_get(sd, 0);
820fd7fd89bSjsing
821fd7fd89bSjsing xorbuf = sr_block_get(sd, strip_size);
82231135a12Stobhe if (xorbuf == NULL)
82331135a12Stobhe goto bad;
824fd7fd89bSjsing if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba,
825fd7fd89bSjsing strip_size, xorbuf))
826fd7fd89bSjsing goto bad;
827fd7fd89bSjsing if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size,
828fd7fd89bSjsing xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL))
829fd7fd89bSjsing goto bad;
830fd7fd89bSjsing
831fd7fd89bSjsing /* Collide write work unit with read work unit. */
832fd7fd89bSjsing wu_r->swu_state = SR_WU_INPROGRESS;
833fd7fd89bSjsing wu_r->swu_flags |= SR_WUF_REBUILD;
834fd7fd89bSjsing wu_w->swu_state = SR_WU_DEFERRED;
835fd7fd89bSjsing wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
836fd7fd89bSjsing wu_r->swu_collider = wu_w;
837fd7fd89bSjsing
838fd7fd89bSjsing /* Block I/O to this strip while we rebuild it. */
839fd7fd89bSjsing wu_r->swu_blk_start = (strip_no / chunk_count) * row_size;
840fd7fd89bSjsing wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1;
841fd7fd89bSjsing wu_w->swu_blk_start = wu_r->swu_blk_start;
842fd7fd89bSjsing wu_w->swu_blk_end = wu_r->swu_blk_end;
843fd7fd89bSjsing
844fd7fd89bSjsing DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, "
845fd7fd89bSjsing "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc),
846fd7fd89bSjsing sd->sd_meta->ssd_devname,
847fd7fd89bSjsing wu_r->swu_blk_start, wu_r->swu_blk_end);
848fd7fd89bSjsing
849fd7fd89bSjsing s = splbio();
850fd7fd89bSjsing TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
851fd7fd89bSjsing splx(s);
852fd7fd89bSjsing
853fd7fd89bSjsing sr_schedule_wu(wu_r);
854fd7fd89bSjsing
855fd7fd89bSjsing slept = 0;
856fd7fd89bSjsing while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
857fe2d3ae9Scheloha tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP);
858fd7fd89bSjsing slept = 1;
859fd7fd89bSjsing }
860632f6d33Scheloha if (!slept) {
861632f6d33Scheloha tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
862632f6d33Scheloha MSEC_TO_NSEC(1));
863632f6d33Scheloha }
864fd7fd89bSjsing
865fd7fd89bSjsing sr_scsi_wu_put(sd, wu_r);
866fd7fd89bSjsing sr_scsi_wu_put(sd, wu_w);
867fd7fd89bSjsing
868bd56a12fSkrw sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count;
869fd7fd89bSjsing
87038301a9fSjsing percent = sr_rebuild_percent(sd);
871fd7fd89bSjsing if (percent != old_percent && strip_no != chunk_strips - 1) {
872fd7fd89bSjsing if (sr_meta_save(sd, SR_META_DIRTY))
873fd7fd89bSjsing printf("%s: could not save metadata to %s\n",
874fd7fd89bSjsing DEVNAME(sd->sd_sc),
875fd7fd89bSjsing sd->sd_meta->ssd_devname);
876fd7fd89bSjsing old_percent = percent;
877fd7fd89bSjsing }
878fd7fd89bSjsing
879fd7fd89bSjsing if (sd->sd_reb_abort)
880fd7fd89bSjsing goto abort;
881fd7fd89bSjsing }
882fd7fd89bSjsing
883fd7fd89bSjsing DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc),
884fd7fd89bSjsing sd->sd_meta->ssd_devname);
885fd7fd89bSjsing
886fd7fd89bSjsing /* all done */
887fd7fd89bSjsing sd->sd_meta->ssd_rebuild = 0;
888fd7fd89bSjsing for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
889fd7fd89bSjsing if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status ==
890fd7fd89bSjsing BIOC_SDREBUILD) {
891fd7fd89bSjsing sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE);
892fd7fd89bSjsing break;
893fd7fd89bSjsing }
894fd7fd89bSjsing }
895fd7fd89bSjsing
896fd7fd89bSjsing return;
897fd7fd89bSjsing
898fd7fd89bSjsing abort:
899fd7fd89bSjsing if (sr_meta_save(sd, SR_META_DIRTY))
900fd7fd89bSjsing printf("%s: could not save metadata to %s\n",
901fd7fd89bSjsing DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);
902fd7fd89bSjsing bad:
903fd7fd89bSjsing return;
904fd7fd89bSjsing }
905fd7fd89bSjsing
#if 0
/*
 * Parity scrub - compiled out and visibly unfinished.
 *
 * For each strip number the parity chunk index is computed, reads of every
 * other chunk are queued on wu_r with a shared XOR buffer, and a write of
 * that buffer to the parity chunk is queued on wu_w.  wu_w is deferred as
 * the collider of wu_r and the thread sleeps until wu_w is flagged
 * SR_WUF_REBUILDIOCOMP.
 *
 * Things to resolve before enabling this:
 *  - every I/O uses the placeholder block number 0xBADCAFE;
 *  - the two work units are obtained once, reused for every strip and
 *    never handed back;
 *  - the results of sr_block_get() and sr_raid5_addio() are not checked.
 */
void
sr_raid5_scrub(struct sr_discipline *sd)
{
	struct sr_workunit	*wu_r, *wu_w;
	int64_t			 strip_no, strip_size, strip_bits, max_strip;
	int64_t			 last_chunk, parity, i;
	void			*xorbuf;
	int			 s, slept;

	wu_w = sr_scsi_wu_get(sd, 0);
	wu_r = sr_scsi_wu_get(sd, 0);

	/* last_chunk is the highest chunk index (chunk count minus one). */
	last_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1;
	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid5.sr5_strip_bits;
	max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits;

	strip_no = 0;
	while (strip_no < max_strip) {
		parity = last_chunk -
		    ((strip_no / last_chunk) % (last_chunk + 1));

		/* Queue a read of every chunk except the parity chunk. */
		xorbuf = sr_block_get(sd, strip_size);
		for (i = 0; i <= last_chunk; i++) {
			if (i == parity)
				continue;
			sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size,
			    NULL, SCSI_DATA_IN, 0, xorbuf);
		}

		/* Queue the write of the XOR buffer to the parity chunk. */
		sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf,
		    SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL);

		wu_r->swu_flags |= SR_WUF_REBUILD;

		/* Collide wu_w with wu_r */
		wu_w->swu_state = SR_WU_DEFERRED;
		wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
		wu_r->swu_collider = wu_w;

		s = splbio();
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
		splx(s);

		wu_r->swu_state = SR_WU_INPROGRESS;
		sr_schedule_wu(wu_r);

		/* Wait for the write work unit to be flagged as complete. */
		for (slept = 0;
		    (wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0;
		    slept = 1)
			tsleep_nsec(wu_w, PRIBIO, "sr_scrub", INFSLP);

		/* If we never had to wait, sleep briefly before going on. */
		if (slept == 0)
			tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield",
			    MSEC_TO_NSEC(1));

		strip_no++;
	}
}
#endif
962