/*	$NetBSD: rf_pqdegdags.c,v 1.6 2001/07/18 06:45:34 thorpej Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */


#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    RF_PhysDiskAddr_t *qpda, void *bp);

/*
   Two data drives have failed, and we are doing a read that covers one of them.
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                      | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node
 * One successor arc from each R node goes to Unblock, and the other to X
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving user data _not_ being read by the user
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data
 * X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer where the
 * failed data is supposed to end up.  This buffer is zeroed by the code here.  Thus,
 * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
 * zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what's needed and what's not.
 ****************************************************************************************/
/*   init a disk node with 2 successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)

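/*
 * Hypothetical usage sketch (disabled): roughly how the macros above
 * would wire one read node into the DAG.  The variables referenced here
 * (rudNodes, pda, blockNode, unblockNode, recoveryNode, dag_h,
 * allocList, parityStripeID, which_ru) are assumed to be set up by the
 * DAG construction routine, as the rf_DoubleDegRead() path does.
 */
#if 0
	INIT_DISK_NODE(&rudNodes[i], "Rud");	/* read of surviving user data */
	DISK_NODE_PARAMS(rudNodes[i], pda);	/* attach pda, buffer, psid, ru */
#endif
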
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    RF_PhysDiskAddr_t *qpda, void *bp)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
	RF_SectorCount_t s0len = ppda->numSector, len;
	RF_SectorNum_t suoffset;
	unsigned coeff;
	char   *pbuf = ppda->bufPtr;
	char   *qbuf = qpda->bufPtr;
	char   *buf;
	int     delta;

	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
	len = pda->numSector;
	/* see if pda intersects a recovery pda */
	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
		buf = pda->bufPtr;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
		coeff = (coeff % raidPtr->Layout.numDataCol);

		if (suoffset < s0off) {
			delta = s0off - suoffset;
			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
			suoffset = s0off;
			len -= delta;
		}
		if (suoffset > s0off) {
			delta = suoffset - s0off;
			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
		}
		if ((suoffset + len) > (s0len + s0off))
			len = s0len + s0off - suoffset;

		/* src, dest, len */
		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);

		/* dest, src, len, coeff */
		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
	}
}
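
/*
 * Worked example of the clipping above, with made-up numbers: if the
 * recovery pda covers stripe-unit sectors [s0off, s0off + s0len) =
 * [16, 32) and the incoming pda covers [8, 40), the first branch skips
 * the 8 leading sectors (suoffset becomes 16, len drops to 24) and the
 * final test clips len to 16, so exactly the overlapping sectors
 * [16, 32) are xored into P and folded into Q.
 */
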
/*
   Recover data in the case of a double failure. There can be two
   result buffers, one for each chunk of data trying to be recovered.
   The params are pda's that have not been range restricted or otherwise
   politely massaged - this should be done here. The last params are the
   pdas of P and Q, followed by the raidPtr and the asmap. The list can
   look like

   pda, pda, ... , p pda, q pda, raidptr, asm

   or

   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm

   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
*/


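/*
 * For reference, a minimal self-contained sketch (disabled, not part of
 * the driver) of the two-erasure algebra that rf_PQ_recover() performs.
 * It assumes Q is the GF(2^8) weighted sum Q = sum(g^k * D_k) over the
 * data columns, with generator g = 2 and polynomial 0x11d; the driver's
 * real tables live in rf_pq.c and may use a different representation.
 * The helper names (gf_mul() and friends) are local to this sketch.
 * With the survivors folded out (as applyPDA() does), P' = Da ^ Db and
 * Q' = g^a*Da ^ g^b*Db, so Da = (g^b*P' ^ Q') / (g^a ^ g^b) and
 * Db = P' ^ Da.
 */
#if 0
#include <stdint.h>

/* GF(2^8) multiply, reducing by x^8 + x^4 + x^3 + x^2 + 1 (0x11d). */
static uint8_t
gf_mul(uint8_t x, uint8_t y)
{
	uint8_t r = 0;

	while (y != 0) {
		if (y & 1)
			r ^= x;
		x = (x << 1) ^ ((x & 0x80) ? 0x1d : 0);
		y >>= 1;
	}
	return r;
}

/* g^k for the generator g = 2 */
static uint8_t
gf_exp2(unsigned k)
{
	uint8_t r = 1;

	while (k-- > 0)
		r = gf_mul(r, 2);
	return r;
}

/* brute-force multiplicative inverse; fine for a sketch (x nonzero) */
static uint8_t
gf_inv(uint8_t x)
{
	uint8_t y = 1;

	while (gf_mul(x, y) != 1)
		y++;
	return y;
}

/*
 * Recover one byte of each erased column a < b from pbar = P' and
 * qbar = Q', the parities with all surviving columns xored out.
 */
static void
pq_recover_byte(uint8_t pbar, uint8_t qbar, unsigned a, unsigned b,
    uint8_t *da, uint8_t *db)
{
	uint8_t ga = gf_exp2(a), gb = gf_exp2(b);

	*da = gf_mul(gf_mul(gb, pbar) ^ qbar, gf_inv(ga ^ gb));
	*db = pbar ^ *da;
}
#endif
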
int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int     d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr, suoffset;
	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
	int     two = 0;
	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
	char   *buf;
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		qpda = node->params[np - 4].p;
		qpda2 = node->params[np - 3].p;
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		buf = pda->bufPtr;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		len = pda->numSector;
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* fold this chunk into each recovery pda pair it intersects */
		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		if (two)
			applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
	}

	/* ok, we got the parity back to the point where we can recover. We
	 * now need to determine the coefficients of the columns that need
	 * to be recovered. Note that at present we can recover only a
	 * single stripe unit. */

	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
						 * to recover. */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		/* need to determine the column of the other failed disk */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
			/* stop at the dead disk that is not the known failed unit */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/* recover the data. Since we only want to recover one
		 * column, we overwrite the parity with the other one. */
		if (coeff < i)	/* recovering 'a' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
		else		/* recovering 'b' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}

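/*
 * Worked example for rf_PQDoubleRecoveryFunc above, with made-up
 * geometry: in a 4-data-column array with columns 1 and 3 dead and a
 * read touching only column 1's stripe unit, coeff = 1 and the
 * dead-disk scan stops at i = 3.  Since coeff < i, rf_PQ_recover()
 * places column 1 in the user buffer and dumps the unwanted column 3
 * over the parity buffer.
 */
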
int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and q. Use P
	 * & Q to recover the second failed data unit in P. Zero fill Q, then
	 * apply the recovered data to q. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
	 * raidPtr, asmap */

	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int     i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* stop at the dead disk that is not the known failed unit */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to keep is written over the
	 * parity; the column we don't care about is dumped in q. */
	if (coeff < i)		/* the unit we keep is 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* the unit we keep is 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc the recovered
	 * data into it. */
	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is the only thing
	 * we are writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
/*
   Two lost data unit write case.

   There are really two cases here:

   (1) The write completely covers the two lost data units.
       In that case, a reconstruct write that doesn't write the
       failed data units will do the correct thing. So in this case,
       the dag looks like

            full stripe read of surviving data units (not being overwritten)
            write new data (ignoring failed units)   compute P&Q
                                                     write P&Q

   (2) The write does not completely cover both failed data units
       (but touches at least one of them). Then we need to do the
       equivalent of a reconstruct read to recover the missing data
       unit from the other stripe.

       For any data we are writing that is not in the "shadow"
       of the failed units, we need to do a four cycle update.
       PANIC in this case, for now.

*/

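/*
 * Selection example for the dispatch below, with made-up geometry
 * (sectorsPerSU = 32): a write fully covering both failed units gives
 * sum = 64 == 2 * 32 and takes the (currently panicking) large-write
 * path; a write covering one failed unit in full gives sum = 32 >= 32
 * and takes the simple small-write DAG; anything else hits the final
 * RF_PANIC().
 */
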
RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	int     sum;
	int     nf = asmap->numDataFailed;

	sum = asmap->failedPDAs[0]->numSector;
	if (nf == 2)
		sum += asmap->failedPDAs[1]->numSector;

	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
		/* large write case */
		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
		/* small write case, no user data outside the shadow */
		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	RF_PANIC();
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
#endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
				 * (RF_INCLUDE_RAID6 > 0) */