xref: /netbsd-src/sys/dev/raidframe/rf_pq.c (revision 1ca5c1b28139779176bd5c13ad7c5f25c0bcd5f8)
1 /*	$NetBSD: rf_pq.c,v 1.11 2001/11/13 07:11:16 lukem Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Daniel Stodolsky
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * Code for RAID level 6 (P + Q) disk array architecture.
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.11 2001/11/13 07:11:16 lukem Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
39 
40 #include <dev/raidframe/raidframevar.h>
41 
42 #include "rf_raid.h"
43 #include "rf_dag.h"
44 #include "rf_dagffrd.h"
45 #include "rf_dagffwr.h"
46 #include "rf_dagdegrd.h"
47 #include "rf_dagdegwr.h"
48 #include "rf_dagutils.h"
49 #include "rf_dagfuncs.h"
50 #include "rf_etimer.h"
51 #include "rf_pqdeg.h"
52 #include "rf_general.h"
53 #include "rf_map.h"
54 #include "rf_pq.h"
55 
56 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
57 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
58 
59 int
60 rf_RegularONPFunc(node)
61 	RF_DagNode_t *node;
62 {
63 	return (rf_RegularXorFunc(node));
64 }
65 /*
66    same as simpleONQ func, but the coefficient is always 1
67 */
68 
69 int
70 rf_SimpleONPFunc(node)
71 	RF_DagNode_t *node;
72 {
73 	return (rf_SimpleXorFunc(node));
74 }
75 
76 int
77 rf_RecoveryPFunc(node)
78 	RF_DagNode_t *node;
79 {
80 	return (rf_RecoveryXorFunc(node));
81 }
82 
83 int
84 rf_RegularPFunc(node)
85 	RF_DagNode_t *node;
86 {
87 	return (rf_RegularXorFunc(node));
88 }
89 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
90 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
91 
/* Forward declarations of the file-local Q helpers defined later in this file. */
static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length, unsigned char coeff);
static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length, unsigned coeff);
98 
99 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
100 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
101 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
102 
103 void
104 rf_PQDagSelect(
105     RF_Raid_t * raidPtr,
106     RF_IoType_t type,
107     RF_AccessStripeMap_t * asmap,
108     RF_VoidFuncPtr * createFunc)
109 {
110 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
111 	unsigned ndfail = asmap->numDataFailed;
112 	unsigned npfail = asmap->numParityFailed;
113 	unsigned ntfail = npfail + ndfail;
114 
115 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
116 	if (ntfail > 2) {
117 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
118 		 /* *infoFunc = */ *createFunc = NULL;
119 		return;
120 	}
121 	/* ok, we can do this I/O */
122 	if (type == RF_IO_TYPE_READ) {
123 		switch (ndfail) {
124 		case 0:
125 			/* fault free read */
126 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
127 			break;
128 		case 1:
129 			/* lost a single data unit */
130 			/* two cases: (1) parity is not lost. do a normal raid
131 			 * 5 reconstruct read. (2) parity is lost. do a
132 			 * reconstruct read using "q". */
133 			if (ntfail == 2) {	/* also lost redundancy */
134 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
135 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
136 				else
137 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
138 			} else {
139 				/* P and Q are ok. But is there a failure in
140 				 * some unaccessed data unit? */
141 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
142 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
143 				else
144 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
145 			}
146 			break;
147 		case 2:
148 			/* lost two data units */
149 			/* *infoFunc = PQOneTwo; */
150 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
151 			break;
152 		}
153 		return;
154 	}
155 	/* a write */
156 	switch (ntfail) {
157 	case 0:		/* fault free */
158 		if (rf_suppressLocksAndLargeWrites ||
159 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
160 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
161 
162 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
163 		} else {
164 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
165 		}
166 		break;
167 
168 	case 1:		/* single disk fault */
169 		if (npfail == 1) {
170 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
171 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
172 										 * normal mode raid5
173 										 * write. */
174 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
175 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
176 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
177 				else
178 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
179 			} else {/* parity died, small write only updating Q */
180 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
181 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
182 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
183 				else
184 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
185 			}
186 		} else {	/* data missing. Do a P reconstruct write if
187 				 * only a single data unit is lost in the
188 				 * stripe, otherwise a PQ reconstruct write. */
189 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
190 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
191 			else
192 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
193 		}
194 		break;
195 
196 	case 2:		/* two disk faults */
197 		switch (npfail) {
198 		case 2:	/* both p and q dead */
199 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
200 			break;
201 		case 1:	/* either p or q and dead data */
202 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
203 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
204 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
205 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
206 			else
207 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
208 			break;
209 		case 0:	/* double data loss */
210 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
211 			break;
212 		}
213 		break;
214 
215 	default:		/* more than 2 disk faults */
216 		*createFunc = NULL;
217 		RF_PANIC();
218 	}
219 	return;
220 }
221 /*
222    Used as a stop gap info function
223 */
224 #if 0
225 static void
226 PQOne(raidPtr, nSucc, nAnte, asmap)
227 	RF_Raid_t *raidPtr;
228 	int    *nSucc;
229 	int    *nAnte;
230 	RF_AccessStripeMap_t *asmap;
231 {
232 	*nSucc = *nAnte = 1;
233 }
234 
235 static void
236 PQOneTwo(raidPtr, nSucc, nAnte, asmap)
237 	RF_Raid_t *raidPtr;
238 	int    *nSucc;
239 	int    *nAnte;
240 	RF_AccessStripeMap_t *asmap;
241 {
242 	*nSucc = 1;
243 	*nAnte = 2;
244 }
245 #endif
246 
247 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
248 {
249 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
250 	    rf_RegularPQFunc, RF_FALSE);
251 }
252 
253 int
254 rf_RegularONQFunc(node)
255 	RF_DagNode_t *node;
256 {
257 	int     np = node->numParams;
258 	int     d;
259 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
260 	int     i;
261 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
262 	RF_Etimer_t timer;
263 	char   *qbuf, *qpbuf;
264 	char   *obuf, *nbuf;
265 	RF_PhysDiskAddr_t *old, *new;
266 	unsigned long coeff;
267 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
268 
269 	RF_ETIMER_START(timer);
270 
271 	d = (np - 3) / 4;
272 	RF_ASSERT(4 * d + 3 == np);
273 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
274 	for (i = 0; i < d; i++) {
275 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
276 		obuf = (char *) node->params[2 * i + 1].p;
277 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
278 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
279 		RF_ASSERT(new->numSector == old->numSector);
280 		RF_ASSERT(new->raidAddress == old->raidAddress);
281 		/* the stripe unit within the stripe tells us the coefficient
282 		 * to use for the multiply. */
283 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
284 		/* compute the data unit offset within the column, then add
285 		 * one */
286 		coeff = (coeff % raidPtr->Layout.numDataCol);
287 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
288 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
289 	}
290 
291 	RF_ETIMER_STOP(timer);
292 	RF_ETIMER_EVAL(timer);
293 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
294 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
295 					 * I/O in this node */
296 	return (0);
297 }
298 /*
299    See the SimpleXORFunc for the difference between a simple and regular func.
300    These Q functions should be used for
301 
302          new q = Q(data,old data,old q)
303 
304    style updates and not for
305 
306          q = ( new data, new data, .... )
307 
308    computations.
309 
310    The simple q takes 2(2d+1)+1 params, where d is the number
311    of stripes written. The order of params is
312    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
313    [2d] old q pda_0, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
315    raidPtr
316 */
317 
318 int
319 rf_SimpleONQFunc(node)
320 	RF_DagNode_t *node;
321 {
322 	int     np = node->numParams;
323 	int     d;
324 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
325 	int     i;
326 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
327 	RF_Etimer_t timer;
328 	char   *qbuf;
329 	char   *obuf, *nbuf;
330 	RF_PhysDiskAddr_t *old, *new;
331 	unsigned long coeff;
332 
333 	RF_ETIMER_START(timer);
334 
335 	d = (np - 3) / 4;
336 	RF_ASSERT(4 * d + 3 == np);
337 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
338 	for (i = 0; i < d; i++) {
339 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
340 		obuf = (char *) node->params[2 * i + 1].p;
341 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
342 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
343 		RF_ASSERT(new->numSector == old->numSector);
344 		RF_ASSERT(new->raidAddress == old->raidAddress);
345 		/* the stripe unit within the stripe tells us the coefficient
346 		 * to use for the multiply. */
347 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
348 		/* compute the data unit offset within the column, then add
349 		 * one */
350 		coeff = (coeff % raidPtr->Layout.numDataCol);
351 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
352 	}
353 
354 	RF_ETIMER_STOP(timer);
355 	RF_ETIMER_EVAL(timer);
356 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
357 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
358 					 * I/O in this node */
359 	return (0);
360 }
361 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
362 {
363 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
364 }
365 
366 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
367 
368 static void
369 RegularQSubr(node, qbuf)
370 	RF_DagNode_t *node;
371 	char   *qbuf;
372 {
373 	int     np = node->numParams;
374 	int     d;
375 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
376 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
377 	int     i;
378 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
379 	RF_Etimer_t timer;
380 	char   *obuf, *qpbuf;
381 	RF_PhysDiskAddr_t *old;
382 	unsigned long coeff;
383 
384 	RF_ETIMER_START(timer);
385 
386 	d = (np - 1) / 2;
387 	RF_ASSERT(2 * d + 1 == np);
388 	for (i = 0; i < d; i++) {
389 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
390 		obuf = (char *) node->params[2 * i + 1].p;
391 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
392 		/* compute the data unit offset within the column, then add
393 		 * one */
394 		coeff = (coeff % raidPtr->Layout.numDataCol);
395 		/* the input buffers may not all be aligned with the start of
396 		 * the stripe. so shift by their sector offset within the
397 		 * stripe unit */
398 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
399 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
400 	}
401 
402 	RF_ETIMER_STOP(timer);
403 	RF_ETIMER_EVAL(timer);
404 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
405 }
406 /*
407    used in degraded writes.
408 */
409 
410 static void DegrQSubr(RF_DagNode_t *node);
411 
412 static void
413 DegrQSubr(node)
414 	RF_DagNode_t *node;
415 {
416 	int     np = node->numParams;
417 	int     d;
418 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
419 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
420 	int     i;
421 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
422 	RF_Etimer_t timer;
423 	char   *qbuf = node->results[1];
424 	char   *obuf, *qpbuf;
425 	RF_PhysDiskAddr_t *old;
426 	unsigned long coeff;
427 	unsigned fail_start;
428 	int     j;
429 
430 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
431 	fail_start = old->startSector % secPerSU;
432 
433 	RF_ETIMER_START(timer);
434 
435 	d = (np - 2) / 2;
436 	RF_ASSERT(2 * d + 2 == np);
437 	for (i = 0; i < d; i++) {
438 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
439 		obuf = (char *) node->params[2 * i + 1].p;
440 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
441 		/* compute the data unit offset within the column, then add
442 		 * one */
443 		coeff = (coeff % raidPtr->Layout.numDataCol);
444 		/* the input buffers may not all be aligned with the start of
445 		 * the stripe. so shift by their sector offset within the
446 		 * stripe unit */
447 		j = old->startSector % secPerSU;
448 		RF_ASSERT(j >= fail_start);
449 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
450 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
451 	}
452 
453 	RF_ETIMER_STOP(timer);
454 	RF_ETIMER_EVAL(timer);
455 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
456 }
457 /*
458    Called by large write code to compute the new parity and the new q.
459 
460    structure of the params:
461 
   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d (d = numDataCol)
463    raidPtr
464 
465    for a total of 2d+1 arguments.
466    The result buffers results[0], results[1] are the buffers for the p and q,
467    respectively.
468 
469    We compute Q first, then compute P. The P calculation may try to reuse
470    one of the input buffers for its output, so if we computed P first, we would
471    corrupt the input for the q calculation.
472 */
473 
474 int
475 rf_RegularPQFunc(node)
476 	RF_DagNode_t *node;
477 {
478 	RegularQSubr(node, node->results[1]);
479 	return (rf_RegularXorFunc(node));	/* does the wakeup */
480 }
481 
482 int
483 rf_RegularQFunc(node)
484 	RF_DagNode_t *node;
485 {
486 	/* Almost ... adjust Qsubr args */
487 	RegularQSubr(node, node->results[0]);
488 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
489 					 * I/O in this node */
490 	return (0);
491 }
492 /*
493    Called by singly degraded write code to compute the new parity and the new q.
494 
495    structure of the params:
496 
497    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
498    failedPDA raidPtr
499 
500    for a total of 2d+2 arguments.
501    The result buffers results[0], results[1] are the buffers for the parity and q,
502    respectively.
503 
504    We compute Q first, then compute parity. The parity calculation may try to reuse
505    one of the input buffers for its output, so if we computed parity first, we would
506    corrupt the input for the q calculation.
507 
508    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
509 */
510 
511 void
512 rf_Degraded_100_PQFunc(node)
513 	RF_DagNode_t *node;
514 {
515 	int     np = node->numParams;
516 
517 	RF_ASSERT(np >= 2);
518 	DegrQSubr(node);
519 	rf_RecoveryXorFunc(node);
520 }
521 
522 
523 /*
524    The two below are used when reading a stripe with a single lost data unit.
525    The parameters are
526 
527    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
528 
529    and results[0] contains the data buffer. Which is originally zero-filled.
530 
531 */
532 
533 /* this Q func is used by the degraded-mode dag functions to recover lost data.
534  * the second-to-last parameter is the PDA for the failed portion of the access.
535  * the code here looks at this PDA and assumes that the xor target buffer is
536  * equal in size to the number of sectors in the failed PDA.  It then uses
537  * the other PDAs in the parameter list to determine where within the target
538  * buffer the corresponding data should be xored.
539  *
540  * Recall the basic equation is
541  *
542  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
543  *
544  * so to recover data_j we need
545  *
546  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
547  *
548  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
549  * copying Q into it. Then we need to do a table lookup to convert to solve
550  *   data_j /= J
551  *
552  *
553  */
554 int
555 rf_RecoveryQFunc(node)
556 	RF_DagNode_t *node;
557 {
558 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
559 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
560 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
561 	int     i;
562 	RF_PhysDiskAddr_t *pda;
563 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
564 	char   *srcbuf, *destbuf;
565 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
566 	RF_Etimer_t timer;
567 	unsigned long coeff;
568 
569 	RF_ETIMER_START(timer);
570 	/* start by copying Q into the buffer */
571 	bcopy(node->params[node->numParams - 3].p, node->results[0],
572 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
573 	for (i = 0; i < node->numParams - 4; i += 2) {
574 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
575 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
576 		srcbuf = (char *) node->params[i + 1].p;
577 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
578 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
579 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
580 		/* compute the data unit offset within the column */
581 		coeff = (coeff % raidPtr->Layout.numDataCol);
582 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
583 	}
584 	/* Do the nasty inversion now */
585 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
586 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
587 	RF_ETIMER_STOP(timer);
588 	RF_ETIMER_EVAL(timer);
589 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
590 	rf_GenericWakeupFunc(node, 0);
591 	return (0);
592 }
593 
594 int
595 rf_RecoveryPQFunc(node)
596 	RF_DagNode_t *node;
597 {
598 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
599 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
600 	return (1);
601 }
602 /*
603    Degraded write Q subroutine.
604    Used when P is dead.
605    Large-write style Q computation.
606    Parameters
607 
608    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
609 
610    We ignore failedPDA.
611 
612    This is a "simple style" recovery func.
613 */
614 
615 void
616 rf_PQ_DegradedWriteQFunc(node)
617 	RF_DagNode_t *node;
618 {
619 	int     np = node->numParams;
620 	int     d;
621 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
622 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
623 	int     i;
624 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
625 	RF_Etimer_t timer;
626 	char   *qbuf = node->results[0];
627 	char   *obuf, *qpbuf;
628 	RF_PhysDiskAddr_t *old;
629 	unsigned long coeff;
630 	int     fail_start, j;
631 
632 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
633 	fail_start = old->startSector % secPerSU;
634 
635 	RF_ETIMER_START(timer);
636 
637 	d = (np - 2) / 2;
638 	RF_ASSERT(2 * d + 2 == np);
639 
640 	for (i = 0; i < d; i++) {
641 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
642 		obuf = (char *) node->params[2 * i + 1].p;
643 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
644 		/* compute the data unit offset within the column, then add
645 		 * one */
646 		coeff = (coeff % raidPtr->Layout.numDataCol);
647 		j = old->startSector % secPerSU;
648 		RF_ASSERT(j >= fail_start);
649 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
650 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
651 	}
652 
653 	RF_ETIMER_STOP(timer);
654 	RF_ETIMER_EVAL(timer);
655 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
656 	rf_GenericWakeupFunc(node, 0);
657 }
658 
659 
660 
661 
662 /* Q computations */
663 
664 /*
   coeff - column;
666 
667    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
668 
669    on 5-bit basis;
670    length in bytes;
671 */
672 
673 void
674 rf_IncQ(dest, buf, length, coeff)
675 	unsigned long *dest;
676 	unsigned long *buf;
677 	unsigned length;
678 	unsigned coeff;
679 {
680 	unsigned long a, d, new;
681 	unsigned long a1, a2;
682 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
683 	unsigned r = rf_rn[coeff + 1];
684 
685 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
686 #define INSERT(a,i) (a << (5L*i))
687 
688 	length /= 8;
689 	/* 13 5 bit quants in a 64 bit word */
690 	while (length) {
691 		a = *buf++;
692 		d = *dest;
693 		a1 = EXTRACT(a, 0) ^ r;
694 		a2 = EXTRACT(a, 1) ^ r;
695 		new = INSERT(a2, 1) | a1;
696 		a1 = EXTRACT(a, 2) ^ r;
697 		a2 = EXTRACT(a, 3) ^ r;
698 		a1 = q[a1];
699 		a2 = q[a2];
700 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
701 		a1 = EXTRACT(a, 4) ^ r;
702 		a2 = EXTRACT(a, 5) ^ r;
703 		a1 = q[a1];
704 		a2 = q[a2];
705 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
706 		a1 = EXTRACT(a, 5) ^ r;
707 		a2 = EXTRACT(a, 6) ^ r;
708 		a1 = q[a1];
709 		a2 = q[a2];
710 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
711 #if RF_LONGSHIFT > 2
712 		a1 = EXTRACT(a, 7) ^ r;
713 		a2 = EXTRACT(a, 8) ^ r;
714 		a1 = q[a1];
715 		a2 = q[a2];
716 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
717 		a1 = EXTRACT(a, 9) ^ r;
718 		a2 = EXTRACT(a, 10) ^ r;
719 		a1 = q[a1];
720 		a2 = q[a2];
721 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
722 		a1 = EXTRACT(a, 11) ^ r;
723 		a2 = EXTRACT(a, 12) ^ r;
724 		a1 = q[a1];
725 		a2 = q[a2];
726 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
727 #endif				/* RF_LONGSHIFT > 2 */
728 		d ^= new;
729 		*dest++ = d;
730 		length--;
731 	}
732 }
733 /*
734    compute
735 
736    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
737 
738    on a five bit basis.
739    optimization: compute old ^ new on 64 bit basis.
740 
741    length in bytes.
742 */
743 
744 static void
745 QDelta(
746     char *dest,
747     char *obuf,
748     char *nbuf,
749     unsigned length,
750     unsigned char coeff)
751 {
752 	unsigned long a, d, new;
753 	unsigned long a1, a2;
754 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
755 	unsigned int r = rf_rn[coeff + 1];
756 
757 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
758 	q = NULL; /* XXX for now */
759 
760 #ifdef _KERNEL
761 	/* PQ in kernel currently not supported because the encoding/decoding
762 	 * table is not present */
763 	memset(dest, 0, length);
764 #else				/* KERNEL */
765 	/* this code probably doesn't work and should be rewritten  -wvcii */
766 	/* 13 5 bit quants in a 64 bit word */
767 	length /= 8;
768 	while (length) {
769 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
770 		a ^= *nbuf++;
771 		d = *dest;
772 		a1 = EXTRACT(a, 0) ^ r;
773 		a2 = EXTRACT(a, 1) ^ r;
774 		a1 = q[a1];
775 		a2 = q[a2];
776 		new = INSERT(a2, 1) | a1;
777 		a1 = EXTRACT(a, 2) ^ r;
778 		a2 = EXTRACT(a, 3) ^ r;
779 		a1 = q[a1];
780 		a2 = q[a2];
781 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
782 		a1 = EXTRACT(a, 4) ^ r;
783 		a2 = EXTRACT(a, 5) ^ r;
784 		a1 = q[a1];
785 		a2 = q[a2];
786 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
787 		a1 = EXTRACT(a, 5) ^ r;
788 		a2 = EXTRACT(a, 6) ^ r;
789 		a1 = q[a1];
790 		a2 = q[a2];
791 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
792 #if RF_LONGSHIFT > 2
793 		a1 = EXTRACT(a, 7) ^ r;
794 		a2 = EXTRACT(a, 8) ^ r;
795 		a1 = q[a1];
796 		a2 = q[a2];
797 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
798 		a1 = EXTRACT(a, 9) ^ r;
799 		a2 = EXTRACT(a, 10) ^ r;
800 		a1 = q[a1];
801 		a2 = q[a2];
802 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
803 		a1 = EXTRACT(a, 11) ^ r;
804 		a2 = EXTRACT(a, 12) ^ r;
805 		a1 = q[a1];
806 		a2 = q[a2];
807 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
808 #endif				/* RF_LONGSHIFT > 2 */
809 		d ^= new;
810 		*dest++ = d;
811 		length--;
812 	}
813 #endif				/* _KERNEL */
814 }
815 /*
816    recover columns a and b from the given p and q into
817    bufs abuf and bbuf. All bufs are word aligned.
818    Length is in bytes.
819 */
820 
821 
822 /*
823  * XXX
824  *
825  * Everything about this seems wrong.
826  */
827 void
828 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
829 	unsigned long *pbuf;
830 	unsigned long *qbuf;
831 	unsigned long *abuf;
832 	unsigned long *bbuf;
833 	unsigned length;
834 	unsigned coeff_a;
835 	unsigned coeff_b;
836 {
837 	unsigned long p, q, a, a0, a1;
838 	int     col = (29 * coeff_a) + coeff_b;
839 	unsigned char *q0 = &(rf_qinv[col][0]);
840 
841 	length /= 8;
842 	while (length) {
843 		p = *pbuf++;
844 		q = *qbuf++;
845 		a0 = EXTRACT(p, 0);
846 		a1 = EXTRACT(q, 0);
847 		a = q0[a0 << 5 | a1];
848 #define MF(i) \
849       a0 = EXTRACT(p,i); \
850       a1 = EXTRACT(q,i); \
851       a  = a | INSERT(q0[a0<<5 | a1],i)
852 
853 		MF(1);
854 		MF(2);
855 		MF(3);
856 		MF(4);
857 		MF(5);
858 		MF(6);
859 #if 0
860 		MF(7);
861 		MF(8);
862 		MF(9);
863 		MF(10);
864 		MF(11);
865 		MF(12);
866 #endif				/* 0 */
867 		*abuf++ = a;
868 		*bbuf++ = a ^ p;
869 		length--;
870 	}
871 }
872 /*
873    Lost parity and a data column. Recover that data column.
874    Assume col coeff is lost. Let q the contents of Q after
875    all surviving data columns have been q-xored out of it.
876    Then we have the equation
877 
878    q[28-coeff][a_i ^ r_i+1] = q
879 
880    but q is cyclic with period 31.
881    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
882       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
883 
884    so a_i = r_{coeff+1} ^ q[3+coeff][q]
885 
886    The routine is passed q buffer and the buffer
   the data is to be recovered into. They can be the same.
888 */
889 
890 
891 
892 static void
893 rf_InvertQ(
894     unsigned long *qbuf,
895     unsigned long *abuf,
896     unsigned length,
897     unsigned coeff)
898 {
899 	unsigned long a, new;
900 	unsigned long a1, a2;
901 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
902 	unsigned r = rf_rn[coeff + 1];
903 
904 	/* 13 5 bit quants in a 64 bit word */
905 	length /= 8;
906 	while (length) {
907 		a = *qbuf++;
908 		a1 = EXTRACT(a, 0);
909 		a2 = EXTRACT(a, 1);
910 		a1 = r ^ q[a1];
911 		a2 = r ^ q[a2];
912 		new = INSERT(a2, 1) | a1;
913 #define M(i,j) \
914       a1 = EXTRACT(a,i); \
915       a2 = EXTRACT(a,j); \
916       a1 = r ^ q[a1]; \
917       a2 = r ^ q[a2]; \
918       new = new | INSERT(a1,i) | INSERT(a2,j)
919 
920 		M(2, 3);
921 		M(4, 5);
922 		M(5, 6);
923 #if RF_LONGSHIFT > 2
924 		M(7, 8);
925 		M(9, 10);
926 		M(11, 12);
927 #endif				/* RF_LONGSHIFT > 2 */
928 		*abuf++ = new;
929 		length--;
930 	}
931 }
932 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
933 				 * (RF_INCLUDE_RAID6 > 0) */
934