/*	$NetBSD: rf_pqdegdags.c,v 1.17 2023/10/15 18:15:20 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.17 2023/10/15 18:15:20 oster Exp $");

#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"

static void
applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
    RF_PhysDiskAddr_t * qpda, const struct buf *bp);

/*
   Two data drives have failed, and we are doing a read that covers one of them.
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                      | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving user data _not_ being read by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data,
 * X = pq recovery node, T = terminate.
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer where the
 * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
 * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
 * zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what's needed and what's not.
 ****************************************************************************************/
/*   init a disk node with 2 successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)

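/*
 * Illustrative use of the macros above (a sketch only; the real node
 * setup for this DAG lives in rf_DoubleDegRead()).  "rNode" and "pda"
 * are hypothetical locals, not names from this file:
 *
 *	INIT_DISK_NODE(rNode, "Rud");	read node in wait state, wired
 *					to block/unblock/recovery nodes
 *	DISK_NODE_PARAMS(*rNode, pda);	params[0..3] = pda, buffer,
 *					parity stripe ID, priority/ru
 *	pda = DISK_NODE_PDA(rNode);	fetch the pda back from params[0]
 */
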
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, const struct buf *bp)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
	RF_SectorCount_t s0len = ppda->numSector, len;
	RF_SectorNum_t suoffset;
	unsigned coeff;
	char   *pbuf = ppda->bufPtr;
	char   *qbuf = qpda->bufPtr;
	char   *buf;
	int     delta;

	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
	len = pda->numSector;
	/* see if pda intersects a recovery pda */
	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
		buf = pda->bufPtr;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
		coeff = (coeff % raidPtr->Layout.numDataCol);

		if (suoffset < s0off) {
			/* clip the leading non-overlapping sectors */
			delta = s0off - suoffset;
			buf += rf_RaidAddressToByte(raidPtr, delta);
			suoffset = s0off;
			len -= delta;
		}
		if (suoffset > s0off) {
			/* advance P and Q to the start of the overlap */
			delta = suoffset - s0off;
			pbuf += rf_RaidAddressToByte(raidPtr, delta);
			qbuf += rf_RaidAddressToByte(raidPtr, delta);
		}
		if ((suoffset + len) > (s0len + s0off))
			len = s0len + s0off - suoffset;

		/* src, dest, len */
		/* rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp); */
		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len));

		/* dest, src, len, coeff */
		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
	}
}
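/*
 * Worked example of the clipping above (hedged, hypothetical numbers):
 * with a recovery pda at s0off = 0, s0len = 16 and a data pda at
 * suoffset = 8, len = 16, the overlap is sectors 8..15, so buf stays
 * put, pbuf/qbuf advance by 8 sectors' worth of bytes, and len is
 * clipped to 8.  Over that overlap, the data is xor'd into P via
 * rf_bxor() and folded into Q, scaled by the column coefficient, via
 * rf_IncQ().
 */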
/*
   Recover data in the case of a double failure. There can be two
   result buffers, one for each chunk of data trying to be recovered.
   The params are pda's that have not been range restricted or otherwise
   politely massaged - this should be done here. The last params are the
   pdas of P and Q, followed by the raidPtr. The list can look like

   pda, pda, ... , p pda, q pda, raidptr, asm

   or

   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm

   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
*/


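/*
 * Worked example of the indexing below for the common single-recovery-
 * chunk case: with the param list laid out as
 *
 *	pda_0, ..., pda_(np-5), p pda, q pda, raidPtr, asmap
 *
 * the P pda is params[np-4].p, the Q pda is params[np-3].p, and the
 * first d = np - 4 entries are the data pdas applied in the loop.
 */
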
void
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr; /* , suoffset; */
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	int     two = 0;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	/* char   *buf; */
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
		/* two recovery chunks -- not supported; the assert fires */
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		/*		ppda2 = node->params[np - 5].p; */
		qpda = node->params[np - 4].p;
		/* 		qpda2 = node->params[np - 3].p; */
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		/* 		buf = pda->bufPtr; */
		/* 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); */
		/* 		len = pda->numSector; */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* see if pda intersects a recovery pda */
		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		if (two)
			applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
	}

	/* Ok, we got the parity back to the point where we can recover.  We
	 * now need to determine the coeffs of the columns that need to be
	 * recovered.  Note that we only need to recover a single stripe
	 * unit here. */

	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
						 * to recover. */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		/* need to determine the column of the other failed disk */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
			/* skip over dead disks */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/* recover the data.  Since we only want to recover one
		 * column, we overwrite the parity with the other one. */
		if (coeff < i)	/* recovering 'a' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
		else		/* recovering 'b' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
}

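/*
 * A sketch of the algebra rf_PQ_recover() is solving above (hedged;
 * the actual implementation lives in rf_pq.c and works symbol-wise in
 * its Galois field, with '*' below a field multiply and '^' xor):
 *
 *	P' = Da ^ Db
 *	Q' = m(a)*Da ^ m(b)*Db
 *
 * where P' and Q' are parity and Q after the surviving data has been
 * backed out by applyPDA(), a and b are the two missing columns, and
 * m() maps a column to its Q multiplier.  Two linear equations in two
 * unknowns: eliminate Da by folding m(a)*P' into Q', then divide by
 * (m(a) ^ m(b)) to obtain Db, and Da = P' ^ Db.
 */
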
void
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and q. Use P
	 * & Q to recover the second failed data unit in P. Zero fill Q, then
	 * apply the recovered data to q. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
	 * raidPtr, asmap */

	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data.  The column we want to recover, we write over
	 * the parity; the column we don't care about, we dump in q. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc the recovered
	 * data into it. */
	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is the only thing
	 * we are writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	/* rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp); */
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector));

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
}
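
/*
 * Hedged sketch of the Q updates used above: rf_IncQ(dest, src, len,
 * coeff) (see rf_pq.c) folds "src" into the Q buffer "dest", scaling
 * each symbol by the multiplier rf_pq.c associates with column
 * coefficient "coeff"; schematically, for each symbol j,
 *
 *	dest[j] ^= mul(m(coeff), src[j]);
 *
 * where mul() and m() stand in for the field multiply and multiplier
 * lookup.  Zero-filling Q and then IncQ'ing every column's data
 * (recovered, newly written, and surviving) rebuilds Q for the stripe.
 */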
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
/*
   Two lost data unit write case.

   There are really two cases here:

   (1) The write completely covers the two lost data units.
       In that case, a reconstruct write that doesn't write the
       failed data units will do the correct thing. So in this case,
       the dag looks like

            full stripe read of surviving data units (not being overwritten)
	    write new data (ignoring failed units)   compute P&Q
	                                             write P&Q


   (2) The write does not completely cover both failed data units
       (but touches at least one of them). Then we need to do the
       equivalent of a reconstruct read to recover the missing data
       unit from the other stripe.

       For any data we are writing that is not in the "shadow"
       of the failed units, we need to do a four cycle update.
       PANIC in this case, for now.

*/

RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	int     sum;
	int     nf = asmap->numDataFailed;

	sum = asmap->failedPDAs[0]->numSector;
	if (nf == 2)
		sum += asmap->failedPDAs[1]->numSector;

	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
		/* large write case */
		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
		/* small write case: all user data lies in the shadow of the
		 * failed units */
		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	RF_PANIC();
}
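/*
 * Worked example of the dispatch above (hedged, hypothetical numbers):
 * with sectorsPerSU = 32 and both failed units written in full, nf == 2
 * and sum == 64 == 2 * sectorsPerSU, so the large-write DAG is chosen
 * (and panics, above).  A write touching exactly one failed unit gives
 * nf == 1 == numStripeUnitsAccessed and takes the simple small-write
 * path below.  Anything else panics.
 */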
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
#endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
				 * (RF_INCLUDE_RAID6 > 0) */