xref: /netbsd-src/sys/dev/raidframe/rf_pq.c (revision 95d875fb90b1458e4f1de6950286ddcd6644bc61)
1 /*	$NetBSD: rf_pq.c,v 1.6 1999/08/15 03:44:46 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Daniel Stodolsky
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * Code for RAID level 6 (P + Q) disk array architecture.
31  */
32 
33 #include "rf_archs.h"
34 #include "rf_types.h"
35 #include "rf_raid.h"
36 #include "rf_dag.h"
37 #include "rf_dagffrd.h"
38 #include "rf_dagffwr.h"
39 #include "rf_dagdegrd.h"
40 #include "rf_dagdegwr.h"
41 #include "rf_dagutils.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_threadid.h"
44 #include "rf_etimer.h"
45 #include "rf_pqdeg.h"
46 #include "rf_general.h"
47 #include "rf_map.h"
48 #include "rf_pq.h"
49 
50 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
51 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
52 
53 int
54 rf_RegularONPFunc(node)
55 	RF_DagNode_t *node;
56 {
57 	return (rf_RegularXorFunc(node));
58 }
59 /*
60    same as simpleONQ func, but the coefficient is always 1
61 */
62 
63 int
64 rf_SimpleONPFunc(node)
65 	RF_DagNode_t *node;
66 {
67 	return (rf_SimpleXorFunc(node));
68 }
69 
70 int
71 rf_RecoveryPFunc(node)
72 	RF_DagNode_t *node;
73 {
74 	return (rf_RecoveryXorFunc(node));
75 }
76 
77 int
78 rf_RegularPFunc(node)
79 	RF_DagNode_t *node;
80 {
81 	return (rf_RegularXorFunc(node));
82 }
83 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
84 
/* Forward declarations of the file-local Q helpers. */
static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff);
static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
    unsigned length, unsigned coeff);
91 
92 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
93 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
94 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
95 
96 void
97 rf_PQDagSelect(
98     RF_Raid_t * raidPtr,
99     RF_IoType_t type,
100     RF_AccessStripeMap_t * asmap,
101     RF_VoidFuncPtr * createFunc)
102 {
103 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
104 	unsigned ndfail = asmap->numDataFailed;
105 	unsigned npfail = asmap->numParityFailed;
106 	unsigned ntfail = npfail + ndfail;
107 
108 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
109 	if (ntfail > 2) {
110 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
111 		 /* *infoFunc = */ *createFunc = NULL;
112 		return;
113 	}
114 	/* ok, we can do this I/O */
115 	if (type == RF_IO_TYPE_READ) {
116 		switch (ndfail) {
117 		case 0:
118 			/* fault free read */
119 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
120 			break;
121 		case 1:
122 			/* lost a single data unit */
123 			/* two cases: (1) parity is not lost. do a normal raid
124 			 * 5 reconstruct read. (2) parity is lost. do a
125 			 * reconstruct read using "q". */
126 			if (ntfail == 2) {	/* also lost redundancy */
127 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
128 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
129 				else
130 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
131 			} else {
132 				/* P and Q are ok. But is there a failure in
133 				 * some unaccessed data unit? */
134 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
135 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
136 				else
137 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
138 			}
139 			break;
140 		case 2:
141 			/* lost two data units */
142 			/* *infoFunc = PQOneTwo; */
143 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
144 			break;
145 		}
146 		return;
147 	}
148 	/* a write */
149 	switch (ntfail) {
150 	case 0:		/* fault free */
151 		if (rf_suppressLocksAndLargeWrites ||
152 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
153 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
154 
155 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
156 		} else {
157 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
158 		}
159 		break;
160 
161 	case 1:		/* single disk fault */
162 		if (npfail == 1) {
163 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
164 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
165 										 * normal mode raid5
166 										 * write. */
167 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
168 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
169 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
170 				else
171 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
172 			} else {/* parity died, small write only updating Q */
173 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
174 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
175 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
176 				else
177 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
178 			}
179 		} else {	/* data missing. Do a P reconstruct write if
180 				 * only a single data unit is lost in the
181 				 * stripe, otherwise a PQ reconstruct write. */
182 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
183 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
184 			else
185 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
186 		}
187 		break;
188 
189 	case 2:		/* two disk faults */
190 		switch (npfail) {
191 		case 2:	/* both p and q dead */
192 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
193 			break;
194 		case 1:	/* either p or q and dead data */
195 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
196 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
197 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
198 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
199 			else
200 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
201 			break;
202 		case 0:	/* double data loss */
203 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
204 			break;
205 		}
206 		break;
207 
208 	default:		/* more than 2 disk faults */
209 		*createFunc = NULL;
210 		RF_PANIC();
211 	}
212 	return;
213 }
214 /*
215    Used as a stop gap info function
216 */
#if 0
/* Disabled stop-gap info function: reports one successor and one antecedent. */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nAnte = 1;
	*nSucc = *nAnte;
}

/* Disabled stop-gap info function: reports one successor and two antecedents. */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
239 
240 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
241 {
242 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
243 	    rf_RegularPQFunc, RF_FALSE);
244 }
245 
246 int
247 rf_RegularONQFunc(node)
248 	RF_DagNode_t *node;
249 {
250 	int     np = node->numParams;
251 	int     d;
252 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
253 	int     i;
254 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
255 	RF_Etimer_t timer;
256 	char   *qbuf, *qpbuf;
257 	char   *obuf, *nbuf;
258 	RF_PhysDiskAddr_t *old, *new;
259 	unsigned long coeff;
260 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
261 
262 	RF_ETIMER_START(timer);
263 
264 	d = (np - 3) / 4;
265 	RF_ASSERT(4 * d + 3 == np);
266 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
267 	for (i = 0; i < d; i++) {
268 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
269 		obuf = (char *) node->params[2 * i + 1].p;
270 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
271 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
272 		RF_ASSERT(new->numSector == old->numSector);
273 		RF_ASSERT(new->raidAddress == old->raidAddress);
274 		/* the stripe unit within the stripe tells us the coefficient
275 		 * to use for the multiply. */
276 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
277 		/* compute the data unit offset within the column, then add
278 		 * one */
279 		coeff = (coeff % raidPtr->Layout.numDataCol);
280 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
281 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
282 	}
283 
284 	RF_ETIMER_STOP(timer);
285 	RF_ETIMER_EVAL(timer);
286 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
287 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
288 					 * I/O in this node */
289 	return (0);
290 }
/*
   See the SimpleXORFunc for the difference between a simple and regular func.
   These Q functions should be used for

         new q = Q(data,old data,old q)

   style updates and not for

         q = ( new data, new data, .... )

   computations.

   The simple q takes 2(2d+1)+1 params, where d is the number
   of stripes written. The order of params is
   [0]    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_(d-1), old data buffer_(d-1)
   [2d]   old q pda_0, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ... new data pda_(d-1), new data buffer_(d-1)
   [4d+2] raidPtr
*/
310 
311 int
312 rf_SimpleONQFunc(node)
313 	RF_DagNode_t *node;
314 {
315 	int     np = node->numParams;
316 	int     d;
317 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
318 	int     i;
319 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
320 	RF_Etimer_t timer;
321 	char   *qbuf;
322 	char   *obuf, *nbuf;
323 	RF_PhysDiskAddr_t *old, *new;
324 	unsigned long coeff;
325 
326 	RF_ETIMER_START(timer);
327 
328 	d = (np - 3) / 4;
329 	RF_ASSERT(4 * d + 3 == np);
330 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
331 	for (i = 0; i < d; i++) {
332 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
333 		obuf = (char *) node->params[2 * i + 1].p;
334 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
335 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
336 		RF_ASSERT(new->numSector == old->numSector);
337 		RF_ASSERT(new->raidAddress == old->raidAddress);
338 		/* the stripe unit within the stripe tells us the coefficient
339 		 * to use for the multiply. */
340 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
341 		/* compute the data unit offset within the column, then add
342 		 * one */
343 		coeff = (coeff % raidPtr->Layout.numDataCol);
344 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
345 	}
346 
347 	RF_ETIMER_STOP(timer);
348 	RF_ETIMER_EVAL(timer);
349 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
350 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
351 					 * I/O in this node */
352 	return (0);
353 }
354 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
355 {
356 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
357 }
358 
359 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
360 
361 static void
362 RegularQSubr(node, qbuf)
363 	RF_DagNode_t *node;
364 	char   *qbuf;
365 {
366 	int     np = node->numParams;
367 	int     d;
368 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
369 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
370 	int     i;
371 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
372 	RF_Etimer_t timer;
373 	char   *obuf, *qpbuf;
374 	RF_PhysDiskAddr_t *old;
375 	unsigned long coeff;
376 
377 	RF_ETIMER_START(timer);
378 
379 	d = (np - 1) / 2;
380 	RF_ASSERT(2 * d + 1 == np);
381 	for (i = 0; i < d; i++) {
382 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
383 		obuf = (char *) node->params[2 * i + 1].p;
384 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
385 		/* compute the data unit offset within the column, then add
386 		 * one */
387 		coeff = (coeff % raidPtr->Layout.numDataCol);
388 		/* the input buffers may not all be aligned with the start of
389 		 * the stripe. so shift by their sector offset within the
390 		 * stripe unit */
391 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
392 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
393 	}
394 
395 	RF_ETIMER_STOP(timer);
396 	RF_ETIMER_EVAL(timer);
397 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
398 }
399 /*
400    used in degraded writes.
401 */
402 
403 static void DegrQSubr(RF_DagNode_t *node);
404 
405 static void
406 DegrQSubr(node)
407 	RF_DagNode_t *node;
408 {
409 	int     np = node->numParams;
410 	int     d;
411 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
412 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
413 	int     i;
414 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
415 	RF_Etimer_t timer;
416 	char   *qbuf = node->results[1];
417 	char   *obuf, *qpbuf;
418 	RF_PhysDiskAddr_t *old;
419 	unsigned long coeff;
420 	unsigned fail_start;
421 	int     j;
422 
423 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
424 	fail_start = old->startSector % secPerSU;
425 
426 	RF_ETIMER_START(timer);
427 
428 	d = (np - 2) / 2;
429 	RF_ASSERT(2 * d + 2 == np);
430 	for (i = 0; i < d; i++) {
431 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
432 		obuf = (char *) node->params[2 * i + 1].p;
433 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
434 		/* compute the data unit offset within the column, then add
435 		 * one */
436 		coeff = (coeff % raidPtr->Layout.numDataCol);
437 		/* the input buffers may not all be aligned with the start of
438 		 * the stripe. so shift by their sector offset within the
439 		 * stripe unit */
440 		j = old->startSector % secPerSU;
441 		RF_ASSERT(j >= fail_start);
442 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
443 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
444 	}
445 
446 	RF_ETIMER_STOP(timer);
447 	RF_ETIMER_EVAL(timer);
448 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
449 }
/*
   Called by large write code to compute the new parity and the new q.

   structure of the params:

   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_(d-1), buffer_(d-1)  ( d = numDataCol )
   raidPtr

   for a total of 2d+1 arguments.
   The result buffers results[0], results[1] are the buffers for the p and q,
   respectively.

   We compute Q first, then compute P. The P calculation may try to reuse
   one of the input buffers for its output, so if we computed P first, we would
   corrupt the input for the q calculation.
*/
466 
467 int
468 rf_RegularPQFunc(node)
469 	RF_DagNode_t *node;
470 {
471 	RegularQSubr(node, node->results[1]);
472 	return (rf_RegularXorFunc(node));	/* does the wakeup */
473 }
474 
475 int
476 rf_RegularQFunc(node)
477 	RF_DagNode_t *node;
478 {
479 	/* Almost ... adjust Qsubr args */
480 	RegularQSubr(node, node->results[0]);
481 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
482 					 * I/O in this node */
483 	return (0);
484 }
485 /*
486    Called by singly degraded write code to compute the new parity and the new q.
487 
488    structure of the params:
489 
490    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
491    failedPDA raidPtr
492 
493    for a total of 2d+2 arguments.
494    The result buffers results[0], results[1] are the buffers for the parity and q,
495    respectively.
496 
497    We compute Q first, then compute parity. The parity calculation may try to reuse
498    one of the input buffers for its output, so if we computed parity first, we would
499    corrupt the input for the q calculation.
500 
501    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
502 */
503 
504 void
505 rf_Degraded_100_PQFunc(node)
506 	RF_DagNode_t *node;
507 {
508 	int     np = node->numParams;
509 
510 	RF_ASSERT(np >= 2);
511 	DegrQSubr(node);
512 	rf_RecoveryXorFunc(node);
513 }
514 
515 
516 /*
517    The two below are used when reading a stripe with a single lost data unit.
518    The parameters are
519 
520    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
521 
522    and results[0] contains the data buffer. Which is originally zero-filled.
523 
524 */
525 
526 /* this Q func is used by the degraded-mode dag functions to recover lost data.
527  * the second-to-last parameter is the PDA for the failed portion of the access.
528  * the code here looks at this PDA and assumes that the xor target buffer is
529  * equal in size to the number of sectors in the failed PDA.  It then uses
530  * the other PDAs in the parameter list to determine where within the target
531  * buffer the corresponding data should be xored.
532  *
533  * Recall the basic equation is
534  *
535  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
536  *
537  * so to recover data_j we need
538  *
539  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
540  *
541  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
542  * copying Q into it. Then we need to do a table lookup to convert to solve
543  *   data_j /= J
544  *
545  *
546  */
547 int
548 rf_RecoveryQFunc(node)
549 	RF_DagNode_t *node;
550 {
551 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
552 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
553 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
554 	int     i;
555 	RF_PhysDiskAddr_t *pda;
556 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
557 	char   *srcbuf, *destbuf;
558 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
559 	RF_Etimer_t timer;
560 	unsigned long coeff;
561 
562 	RF_ETIMER_START(timer);
563 	/* start by copying Q into the buffer */
564 	bcopy(node->params[node->numParams - 3].p, node->results[0],
565 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
566 	for (i = 0; i < node->numParams - 4; i += 2) {
567 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
568 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
569 		srcbuf = (char *) node->params[i + 1].p;
570 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
571 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
572 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
573 		/* compute the data unit offset within the column */
574 		coeff = (coeff % raidPtr->Layout.numDataCol);
575 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
576 	}
577 	/* Do the nasty inversion now */
578 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
579 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
580 	RF_ETIMER_STOP(timer);
581 	RF_ETIMER_EVAL(timer);
582 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
583 	rf_GenericWakeupFunc(node, 0);
584 	return (0);
585 }
586 
587 int
588 rf_RecoveryPQFunc(node)
589 	RF_DagNode_t *node;
590 {
591 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
592 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
593 	return (1);
594 }
595 /*
596    Degraded write Q subroutine.
597    Used when P is dead.
598    Large-write style Q computation.
599    Parameters
600 
601    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
602 
603    We ignore failedPDA.
604 
605    This is a "simple style" recovery func.
606 */
607 
608 void
609 rf_PQ_DegradedWriteQFunc(node)
610 	RF_DagNode_t *node;
611 {
612 	int     np = node->numParams;
613 	int     d;
614 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
615 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
616 	int     i;
617 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
618 	RF_Etimer_t timer;
619 	char   *qbuf = node->results[0];
620 	char   *obuf, *qpbuf;
621 	RF_PhysDiskAddr_t *old;
622 	unsigned long coeff;
623 	int     fail_start, j;
624 
625 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
626 	fail_start = old->startSector % secPerSU;
627 
628 	RF_ETIMER_START(timer);
629 
630 	d = (np - 2) / 2;
631 	RF_ASSERT(2 * d + 2 == np);
632 
633 	for (i = 0; i < d; i++) {
634 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
635 		obuf = (char *) node->params[2 * i + 1].p;
636 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
637 		/* compute the data unit offset within the column, then add
638 		 * one */
639 		coeff = (coeff % raidPtr->Layout.numDataCol);
640 		j = old->startSector % secPerSU;
641 		RF_ASSERT(j >= fail_start);
642 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
643 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
644 	}
645 
646 	RF_ETIMER_STOP(timer);
647 	RF_ETIMER_EVAL(timer);
648 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
649 	rf_GenericWakeupFunc(node, 0);
650 }
651 
652 
653 
654 
655 /* Q computations */
656 
/*
   coeff - column;

   compute  dest ^= qfor[28-coeff][rn[coeff+1] a]

   on 5-bit basis;
   length in bytes;
*/
665 
666 void
667 rf_IncQ(dest, buf, length, coeff)
668 	unsigned long *dest;
669 	unsigned long *buf;
670 	unsigned length;
671 	unsigned coeff;
672 {
673 	unsigned long a, d, new;
674 	unsigned long a1, a2;
675 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
676 	unsigned r = rf_rn[coeff + 1];
677 
678 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
679 #define INSERT(a,i) (a << (5L*i))
680 
681 	length /= 8;
682 	/* 13 5 bit quants in a 64 bit word */
683 	while (length) {
684 		a = *buf++;
685 		d = *dest;
686 		a1 = EXTRACT(a, 0) ^ r;
687 		a2 = EXTRACT(a, 1) ^ r;
688 		new = INSERT(a2, 1) | a1;
689 		a1 = EXTRACT(a, 2) ^ r;
690 		a2 = EXTRACT(a, 3) ^ r;
691 		a1 = q[a1];
692 		a2 = q[a2];
693 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
694 		a1 = EXTRACT(a, 4) ^ r;
695 		a2 = EXTRACT(a, 5) ^ r;
696 		a1 = q[a1];
697 		a2 = q[a2];
698 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
699 		a1 = EXTRACT(a, 5) ^ r;
700 		a2 = EXTRACT(a, 6) ^ r;
701 		a1 = q[a1];
702 		a2 = q[a2];
703 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
704 #if RF_LONGSHIFT > 2
705 		a1 = EXTRACT(a, 7) ^ r;
706 		a2 = EXTRACT(a, 8) ^ r;
707 		a1 = q[a1];
708 		a2 = q[a2];
709 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
710 		a1 = EXTRACT(a, 9) ^ r;
711 		a2 = EXTRACT(a, 10) ^ r;
712 		a1 = q[a1];
713 		a2 = q[a2];
714 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
715 		a1 = EXTRACT(a, 11) ^ r;
716 		a2 = EXTRACT(a, 12) ^ r;
717 		a1 = q[a1];
718 		a2 = q[a2];
719 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
720 #endif				/* RF_LONGSHIFT > 2 */
721 		d ^= new;
722 		*dest++ = d;
723 		length--;
724 	}
725 }
726 /*
727    compute
728 
729    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
730 
731    on a five bit basis.
732    optimization: compute old ^ new on 64 bit basis.
733 
734    length in bytes.
735 */
736 
737 static void
738 QDelta(
739     char *dest,
740     char *obuf,
741     char *nbuf,
742     unsigned length,
743     unsigned char coeff)
744 {
745 	unsigned long a, d, new;
746 	unsigned long a1, a2;
747 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
748 	unsigned int r = rf_rn[coeff + 1];
749 
750 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
751 	q = NULL; /* XXX for now */
752 
753 #ifdef _KERNEL
754 	/* PQ in kernel currently not supported because the encoding/decoding
755 	 * table is not present */
756 	bzero(dest, length);
757 #else				/* KERNEL */
758 	/* this code probably doesn't work and should be rewritten  -wvcii */
759 	/* 13 5 bit quants in a 64 bit word */
760 	length /= 8;
761 	while (length) {
762 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
763 		a ^= *nbuf++;
764 		d = *dest;
765 		a1 = EXTRACT(a, 0) ^ r;
766 		a2 = EXTRACT(a, 1) ^ r;
767 		a1 = q[a1];
768 		a2 = q[a2];
769 		new = INSERT(a2, 1) | a1;
770 		a1 = EXTRACT(a, 2) ^ r;
771 		a2 = EXTRACT(a, 3) ^ r;
772 		a1 = q[a1];
773 		a2 = q[a2];
774 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
775 		a1 = EXTRACT(a, 4) ^ r;
776 		a2 = EXTRACT(a, 5) ^ r;
777 		a1 = q[a1];
778 		a2 = q[a2];
779 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
780 		a1 = EXTRACT(a, 5) ^ r;
781 		a2 = EXTRACT(a, 6) ^ r;
782 		a1 = q[a1];
783 		a2 = q[a2];
784 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
785 #if RF_LONGSHIFT > 2
786 		a1 = EXTRACT(a, 7) ^ r;
787 		a2 = EXTRACT(a, 8) ^ r;
788 		a1 = q[a1];
789 		a2 = q[a2];
790 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
791 		a1 = EXTRACT(a, 9) ^ r;
792 		a2 = EXTRACT(a, 10) ^ r;
793 		a1 = q[a1];
794 		a2 = q[a2];
795 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
796 		a1 = EXTRACT(a, 11) ^ r;
797 		a2 = EXTRACT(a, 12) ^ r;
798 		a1 = q[a1];
799 		a2 = q[a2];
800 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
801 #endif				/* RF_LONGSHIFT > 2 */
802 		d ^= new;
803 		*dest++ = d;
804 		length--;
805 	}
806 #endif				/* _KERNEL */
807 }
808 /*
809    recover columns a and b from the given p and q into
810    bufs abuf and bbuf. All bufs are word aligned.
811    Length is in bytes.
812 */
813 
814 
815 /*
816  * XXX
817  *
818  * Everything about this seems wrong.
819  */
820 void
821 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
822 	unsigned long *pbuf;
823 	unsigned long *qbuf;
824 	unsigned long *abuf;
825 	unsigned long *bbuf;
826 	unsigned length;
827 	unsigned coeff_a;
828 	unsigned coeff_b;
829 {
830 	unsigned long p, q, a, a0, a1;
831 	int     col = (29 * coeff_a) + coeff_b;
832 	unsigned char *q0 = &(rf_qinv[col][0]);
833 
834 	length /= 8;
835 	while (length) {
836 		p = *pbuf++;
837 		q = *qbuf++;
838 		a0 = EXTRACT(p, 0);
839 		a1 = EXTRACT(q, 0);
840 		a = q0[a0 << 5 | a1];
841 #define MF(i) \
842       a0 = EXTRACT(p,i); \
843       a1 = EXTRACT(q,i); \
844       a  = a | INSERT(q0[a0<<5 | a1],i)
845 
846 		MF(1);
847 		MF(2);
848 		MF(3);
849 		MF(4);
850 		MF(5);
851 		MF(6);
852 #if 0
853 		MF(7);
854 		MF(8);
855 		MF(9);
856 		MF(10);
857 		MF(11);
858 		MF(12);
859 #endif				/* 0 */
860 		*abuf++ = a;
861 		*bbuf++ = a ^ p;
862 		length--;
863 	}
864 }
/*
   Lost parity and a data column. Recover that data column.
   Assume col coeff is lost. Let q be the contents of Q after
   all surviving data columns have been q-xored out of it.
   Then we have the equation

   q[28-coeff][a_i ^ r_i+1] = q

   but q is cyclic with period 31.
   So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
      q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .

   so a_i = r_{coeff+1} ^ q[3+coeff][q]

   The routine is passed the q buffer and the buffer
   the data is to be recovered into. They can be the same.
*/
882 
883 
884 
885 static void
886 rf_InvertQ(
887     unsigned long *qbuf,
888     unsigned long *abuf,
889     unsigned length,
890     unsigned coeff)
891 {
892 	unsigned long a, new;
893 	unsigned long a1, a2;
894 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
895 	unsigned r = rf_rn[coeff + 1];
896 
897 	/* 13 5 bit quants in a 64 bit word */
898 	length /= 8;
899 	while (length) {
900 		a = *qbuf++;
901 		a1 = EXTRACT(a, 0);
902 		a2 = EXTRACT(a, 1);
903 		a1 = r ^ q[a1];
904 		a2 = r ^ q[a2];
905 		new = INSERT(a2, 1) | a1;
906 #define M(i,j) \
907       a1 = EXTRACT(a,i); \
908       a2 = EXTRACT(a,j); \
909       a1 = r ^ q[a1]; \
910       a2 = r ^ q[a2]; \
911       new = new | INSERT(a1,i) | INSERT(a2,j)
912 
913 		M(2, 3);
914 		M(4, 5);
915 		M(5, 6);
916 #if RF_LONGSHIFT > 2
917 		M(7, 8);
918 		M(9, 10);
919 		M(11, 12);
920 #endif				/* RF_LONGSHIFT > 2 */
921 		*abuf++ = new;
922 		length--;
923 	}
924 }
925 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
926 				 * (RF_INCLUDE_RAID6 > 0) */
927