xref: /netbsd-src/sys/dev/raidframe/rf_pq.c (revision 4472dbe5e3bd91ef2540bada7a7ca7384627ff9b)
1 /*	$NetBSD: rf_pq.c,v 1.7 2000/01/07 03:41:02 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Daniel Stodolsky
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * Code for RAID level 6 (P + Q) disk array architecture.
31  */
32 
33 #include "rf_archs.h"
34 #include "rf_types.h"
35 #include "rf_raid.h"
36 #include "rf_dag.h"
37 #include "rf_dagffrd.h"
38 #include "rf_dagffwr.h"
39 #include "rf_dagdegrd.h"
40 #include "rf_dagdegwr.h"
41 #include "rf_dagutils.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_etimer.h"
44 #include "rf_pqdeg.h"
45 #include "rf_general.h"
46 #include "rf_map.h"
47 #include "rf_pq.h"
48 
/*
 * Redundancy-function tables for the P unit.  Each initializer lists a
 * function followed by a descriptive name string, twice: rf_pFuncs holds the
 * "regular" and "simple" old-new P functions, rf_pRecoveryFuncs uses
 * rf_RecoveryPFunc for both slots.
 */
RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
51 
52 int
53 rf_RegularONPFunc(node)
54 	RF_DagNode_t *node;
55 {
56 	return (rf_RegularXorFunc(node));
57 }
58 /*
59    same as simpleONQ func, but the coefficient is always 1
60 */
61 
62 int
63 rf_SimpleONPFunc(node)
64 	RF_DagNode_t *node;
65 {
66 	return (rf_SimpleXorFunc(node));
67 }
68 
69 int
70 rf_RecoveryPFunc(node)
71 	RF_DagNode_t *node;
72 {
73 	return (rf_RecoveryXorFunc(node));
74 }
75 
76 int
77 rf_RegularPFunc(node)
78 	RF_DagNode_t *node;
79 {
80 	return (rf_RegularXorFunc(node));
81 }
82 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
83 
/* Forward declarations for the file-local Q helpers defined further down. */
static void
QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff);
static void
rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
    unsigned length, unsigned coeff);

/*
 * Redundancy-function tables for the Q unit and for Q / P+Q recovery.  Each
 * initializer lists a function followed by a descriptive name string, twice.
 */
RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
94 
95 void
96 rf_PQDagSelect(
97     RF_Raid_t * raidPtr,
98     RF_IoType_t type,
99     RF_AccessStripeMap_t * asmap,
100     RF_VoidFuncPtr * createFunc)
101 {
102 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
103 	unsigned ndfail = asmap->numDataFailed;
104 	unsigned npfail = asmap->numParityFailed;
105 	unsigned ntfail = npfail + ndfail;
106 
107 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
108 	if (ntfail > 2) {
109 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
110 		 /* *infoFunc = */ *createFunc = NULL;
111 		return;
112 	}
113 	/* ok, we can do this I/O */
114 	if (type == RF_IO_TYPE_READ) {
115 		switch (ndfail) {
116 		case 0:
117 			/* fault free read */
118 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
119 			break;
120 		case 1:
121 			/* lost a single data unit */
122 			/* two cases: (1) parity is not lost. do a normal raid
123 			 * 5 reconstruct read. (2) parity is lost. do a
124 			 * reconstruct read using "q". */
125 			if (ntfail == 2) {	/* also lost redundancy */
126 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
127 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
128 				else
129 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
130 			} else {
131 				/* P and Q are ok. But is there a failure in
132 				 * some unaccessed data unit? */
133 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
134 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
135 				else
136 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
137 			}
138 			break;
139 		case 2:
140 			/* lost two data units */
141 			/* *infoFunc = PQOneTwo; */
142 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
143 			break;
144 		}
145 		return;
146 	}
147 	/* a write */
148 	switch (ntfail) {
149 	case 0:		/* fault free */
150 		if (rf_suppressLocksAndLargeWrites ||
151 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
152 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
153 
154 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
155 		} else {
156 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
157 		}
158 		break;
159 
160 	case 1:		/* single disk fault */
161 		if (npfail == 1) {
162 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
163 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
164 										 * normal mode raid5
165 										 * write. */
166 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
167 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
168 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
169 				else
170 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
171 			} else {/* parity died, small write only updating Q */
172 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
173 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
174 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
175 				else
176 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
177 			}
178 		} else {	/* data missing. Do a P reconstruct write if
179 				 * only a single data unit is lost in the
180 				 * stripe, otherwise a PQ reconstruct write. */
181 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
182 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
183 			else
184 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
185 		}
186 		break;
187 
188 	case 2:		/* two disk faults */
189 		switch (npfail) {
190 		case 2:	/* both p and q dead */
191 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
192 			break;
193 		case 1:	/* either p or q and dead data */
194 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
195 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
196 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
197 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
198 			else
199 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
200 			break;
201 		case 0:	/* double data loss */
202 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
203 			break;
204 		}
205 		break;
206 
207 	default:		/* more than 2 disk faults */
208 		*createFunc = NULL;
209 		RF_PANIC();
210 	}
211 	return;
212 }
213 /*
214    Used as a stop gap info function
215 */
#if 0
/*
 * Compiled out.  Both helpers only store fixed values through nSucc and
 * nAnte and ignore raidPtr and asmap.
 */

/* Stores 1 in both *nSucc and *nAnte. */
static void
PQOne(raidPtr, nSucc, nAnte, asmap)
	RF_Raid_t *raidPtr;
	int    *nSucc;
	int    *nAnte;
	RF_AccessStripeMap_t *asmap;
{
	*nSucc = *nAnte = 1;
}

/* Stores 1 in *nSucc and 2 in *nAnte. */
static void
PQOneTwo(raidPtr, nSucc, nAnte, asmap)
	RF_Raid_t *raidPtr;
	int    *nSucc;
	int    *nAnte;
	RF_AccessStripeMap_t *asmap;
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
238 
/*
 * Large-write DAG creation for P+Q.  Forwards the arguments supplied by
 * RF_CREATE_DAG_FUNC_DECL to rf_CommonCreateLargeWriteDAG(), adding the
 * constant 2, rf_RegularPQFunc as the redundancy function, and RF_FALSE.
 * NOTE(review): the 2 is presumably the number of redundancy units (P and
 * Q) -- confirm against rf_CommonCreateLargeWriteDAG().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
{
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
	    rf_RegularPQFunc, RF_FALSE);
}
244 
245 int
246 rf_RegularONQFunc(node)
247 	RF_DagNode_t *node;
248 {
249 	int     np = node->numParams;
250 	int     d;
251 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
252 	int     i;
253 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
254 	RF_Etimer_t timer;
255 	char   *qbuf, *qpbuf;
256 	char   *obuf, *nbuf;
257 	RF_PhysDiskAddr_t *old, *new;
258 	unsigned long coeff;
259 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
260 
261 	RF_ETIMER_START(timer);
262 
263 	d = (np - 3) / 4;
264 	RF_ASSERT(4 * d + 3 == np);
265 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
266 	for (i = 0; i < d; i++) {
267 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
268 		obuf = (char *) node->params[2 * i + 1].p;
269 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
270 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
271 		RF_ASSERT(new->numSector == old->numSector);
272 		RF_ASSERT(new->raidAddress == old->raidAddress);
273 		/* the stripe unit within the stripe tells us the coefficient
274 		 * to use for the multiply. */
275 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
276 		/* compute the data unit offset within the column, then add
277 		 * one */
278 		coeff = (coeff % raidPtr->Layout.numDataCol);
279 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
280 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
281 	}
282 
283 	RF_ETIMER_STOP(timer);
284 	RF_ETIMER_EVAL(timer);
285 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
286 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
287 					 * I/O in this node */
288 	return (0);
289 }
290 /*
291    See the SimpleXORFunc for the difference between a simple and regular func.
292    These Q functions should be used for
293 
294          new q = Q(data,old data,old q)
295 
296    style updates and not for
297 
298          q = ( new data, new data, .... )
299 
300    computations.
301 
302    The simple q takes 2(2d+1)+1 params, where d is the number
303    of stripes written. The order of params is
304    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
305    [2d] old q pda_0, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ...                                    new data pda_d, new data buffer_d
307    raidPtr
308 */
309 
310 int
311 rf_SimpleONQFunc(node)
312 	RF_DagNode_t *node;
313 {
314 	int     np = node->numParams;
315 	int     d;
316 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
317 	int     i;
318 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
319 	RF_Etimer_t timer;
320 	char   *qbuf;
321 	char   *obuf, *nbuf;
322 	RF_PhysDiskAddr_t *old, *new;
323 	unsigned long coeff;
324 
325 	RF_ETIMER_START(timer);
326 
327 	d = (np - 3) / 4;
328 	RF_ASSERT(4 * d + 3 == np);
329 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
330 	for (i = 0; i < d; i++) {
331 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
332 		obuf = (char *) node->params[2 * i + 1].p;
333 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
334 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
335 		RF_ASSERT(new->numSector == old->numSector);
336 		RF_ASSERT(new->raidAddress == old->raidAddress);
337 		/* the stripe unit within the stripe tells us the coefficient
338 		 * to use for the multiply. */
339 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
340 		/* compute the data unit offset within the column, then add
341 		 * one */
342 		coeff = (coeff % raidPtr->Layout.numDataCol);
343 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
344 	}
345 
346 	RF_ETIMER_STOP(timer);
347 	RF_ETIMER_EVAL(timer);
348 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
349 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
350 					 * I/O in this node */
351 	return (0);
352 }
/*
 * Small-write DAG creation for P+Q.  Forwards the arguments supplied by
 * RF_CREATE_DAG_FUNC_DECL to rf_CommonCreateSmallWriteDAG(), passing the
 * rf_pFuncs and rf_qFuncs tables as the two redundancy-function sets.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
{
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
}
357 
358 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
359 
360 static void
361 RegularQSubr(node, qbuf)
362 	RF_DagNode_t *node;
363 	char   *qbuf;
364 {
365 	int     np = node->numParams;
366 	int     d;
367 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
368 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
369 	int     i;
370 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
371 	RF_Etimer_t timer;
372 	char   *obuf, *qpbuf;
373 	RF_PhysDiskAddr_t *old;
374 	unsigned long coeff;
375 
376 	RF_ETIMER_START(timer);
377 
378 	d = (np - 1) / 2;
379 	RF_ASSERT(2 * d + 1 == np);
380 	for (i = 0; i < d; i++) {
381 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
382 		obuf = (char *) node->params[2 * i + 1].p;
383 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
384 		/* compute the data unit offset within the column, then add
385 		 * one */
386 		coeff = (coeff % raidPtr->Layout.numDataCol);
387 		/* the input buffers may not all be aligned with the start of
388 		 * the stripe. so shift by their sector offset within the
389 		 * stripe unit */
390 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
391 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
392 	}
393 
394 	RF_ETIMER_STOP(timer);
395 	RF_ETIMER_EVAL(timer);
396 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
397 }
398 /*
399    used in degraded writes.
400 */
401 
402 static void DegrQSubr(RF_DagNode_t *node);
403 
404 static void
405 DegrQSubr(node)
406 	RF_DagNode_t *node;
407 {
408 	int     np = node->numParams;
409 	int     d;
410 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
411 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
412 	int     i;
413 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
414 	RF_Etimer_t timer;
415 	char   *qbuf = node->results[1];
416 	char   *obuf, *qpbuf;
417 	RF_PhysDiskAddr_t *old;
418 	unsigned long coeff;
419 	unsigned fail_start;
420 	int     j;
421 
422 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
423 	fail_start = old->startSector % secPerSU;
424 
425 	RF_ETIMER_START(timer);
426 
427 	d = (np - 2) / 2;
428 	RF_ASSERT(2 * d + 2 == np);
429 	for (i = 0; i < d; i++) {
430 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
431 		obuf = (char *) node->params[2 * i + 1].p;
432 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
433 		/* compute the data unit offset within the column, then add
434 		 * one */
435 		coeff = (coeff % raidPtr->Layout.numDataCol);
436 		/* the input buffers may not all be aligned with the start of
437 		 * the stripe. so shift by their sector offset within the
438 		 * stripe unit */
439 		j = old->startSector % secPerSU;
440 		RF_ASSERT(j >= fail_start);
441 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
442 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
443 	}
444 
445 	RF_ETIMER_STOP(timer);
446 	RF_ETIMER_EVAL(timer);
447 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
448 }
449 /*
450    Called by large write code to compute the new parity and the new q.
451 
452    structure of the params:
453 
   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
455    raidPtr
456 
457    for a total of 2d+1 arguments.
458    The result buffers results[0], results[1] are the buffers for the p and q,
459    respectively.
460 
461    We compute Q first, then compute P. The P calculation may try to reuse
462    one of the input buffers for its output, so if we computed P first, we would
463    corrupt the input for the q calculation.
464 */
465 
466 int
467 rf_RegularPQFunc(node)
468 	RF_DagNode_t *node;
469 {
470 	RegularQSubr(node, node->results[1]);
471 	return (rf_RegularXorFunc(node));	/* does the wakeup */
472 }
473 
474 int
475 rf_RegularQFunc(node)
476 	RF_DagNode_t *node;
477 {
478 	/* Almost ... adjust Qsubr args */
479 	RegularQSubr(node, node->results[0]);
480 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
481 					 * I/O in this node */
482 	return (0);
483 }
484 /*
485    Called by singly degraded write code to compute the new parity and the new q.
486 
487    structure of the params:
488 
489    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
490    failedPDA raidPtr
491 
492    for a total of 2d+2 arguments.
493    The result buffers results[0], results[1] are the buffers for the parity and q,
494    respectively.
495 
496    We compute Q first, then compute parity. The parity calculation may try to reuse
497    one of the input buffers for its output, so if we computed parity first, we would
498    corrupt the input for the q calculation.
499 
500    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
501 */
502 
503 void
504 rf_Degraded_100_PQFunc(node)
505 	RF_DagNode_t *node;
506 {
507 	int     np = node->numParams;
508 
509 	RF_ASSERT(np >= 2);
510 	DegrQSubr(node);
511 	rf_RecoveryXorFunc(node);
512 }
513 
514 
515 /*
516    The two below are used when reading a stripe with a single lost data unit.
517    The parameters are
518 
519    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
520 
521    and results[0] contains the data buffer. Which is originally zero-filled.
522 
523 */
524 
525 /* this Q func is used by the degraded-mode dag functions to recover lost data.
526  * the second-to-last parameter is the PDA for the failed portion of the access.
527  * the code here looks at this PDA and assumes that the xor target buffer is
528  * equal in size to the number of sectors in the failed PDA.  It then uses
529  * the other PDAs in the parameter list to determine where within the target
530  * buffer the corresponding data should be xored.
531  *
532  * Recall the basic equation is
533  *
534  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
535  *
536  * so to recover data_j we need
537  *
538  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
539  *
540  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
541  * copying Q into it. Then we need to do a table lookup to convert to solve
542  *   data_j /= J
543  *
544  *
545  */
546 int
547 rf_RecoveryQFunc(node)
548 	RF_DagNode_t *node;
549 {
550 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
551 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
552 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
553 	int     i;
554 	RF_PhysDiskAddr_t *pda;
555 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
556 	char   *srcbuf, *destbuf;
557 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
558 	RF_Etimer_t timer;
559 	unsigned long coeff;
560 
561 	RF_ETIMER_START(timer);
562 	/* start by copying Q into the buffer */
563 	bcopy(node->params[node->numParams - 3].p, node->results[0],
564 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
565 	for (i = 0; i < node->numParams - 4; i += 2) {
566 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
567 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
568 		srcbuf = (char *) node->params[i + 1].p;
569 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
570 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
571 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
572 		/* compute the data unit offset within the column */
573 		coeff = (coeff % raidPtr->Layout.numDataCol);
574 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
575 	}
576 	/* Do the nasty inversion now */
577 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
578 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
579 	RF_ETIMER_STOP(timer);
580 	RF_ETIMER_EVAL(timer);
581 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
582 	rf_GenericWakeupFunc(node, 0);
583 	return (0);
584 }
585 
586 int
587 rf_RecoveryPQFunc(node)
588 	RF_DagNode_t *node;
589 {
590 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
591 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
592 	return (1);
593 }
594 /*
595    Degraded write Q subroutine.
596    Used when P is dead.
597    Large-write style Q computation.
598    Parameters
599 
600    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
601 
602    We ignore failedPDA.
603 
604    This is a "simple style" recovery func.
605 */
606 
607 void
608 rf_PQ_DegradedWriteQFunc(node)
609 	RF_DagNode_t *node;
610 {
611 	int     np = node->numParams;
612 	int     d;
613 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
614 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
615 	int     i;
616 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
617 	RF_Etimer_t timer;
618 	char   *qbuf = node->results[0];
619 	char   *obuf, *qpbuf;
620 	RF_PhysDiskAddr_t *old;
621 	unsigned long coeff;
622 	int     fail_start, j;
623 
624 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
625 	fail_start = old->startSector % secPerSU;
626 
627 	RF_ETIMER_START(timer);
628 
629 	d = (np - 2) / 2;
630 	RF_ASSERT(2 * d + 2 == np);
631 
632 	for (i = 0; i < d; i++) {
633 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
634 		obuf = (char *) node->params[2 * i + 1].p;
635 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
636 		/* compute the data unit offset within the column, then add
637 		 * one */
638 		coeff = (coeff % raidPtr->Layout.numDataCol);
639 		j = old->startSector % secPerSU;
640 		RF_ASSERT(j >= fail_start);
641 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
642 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
643 	}
644 
645 	RF_ETIMER_STOP(timer);
646 	RF_ETIMER_EVAL(timer);
647 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
648 	rf_GenericWakeupFunc(node, 0);
649 }
650 
651 
652 
653 
654 /* Q computations */
655 
656 /*
   coeff - column;
658 
659    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
660 
661    on 5-bit basis;
662    length in bytes;
663 */
664 
665 void
666 rf_IncQ(dest, buf, length, coeff)
667 	unsigned long *dest;
668 	unsigned long *buf;
669 	unsigned length;
670 	unsigned coeff;
671 {
672 	unsigned long a, d, new;
673 	unsigned long a1, a2;
674 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
675 	unsigned r = rf_rn[coeff + 1];
676 
677 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
678 #define INSERT(a,i) (a << (5L*i))
679 
680 	length /= 8;
681 	/* 13 5 bit quants in a 64 bit word */
682 	while (length) {
683 		a = *buf++;
684 		d = *dest;
685 		a1 = EXTRACT(a, 0) ^ r;
686 		a2 = EXTRACT(a, 1) ^ r;
687 		new = INSERT(a2, 1) | a1;
688 		a1 = EXTRACT(a, 2) ^ r;
689 		a2 = EXTRACT(a, 3) ^ r;
690 		a1 = q[a1];
691 		a2 = q[a2];
692 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
693 		a1 = EXTRACT(a, 4) ^ r;
694 		a2 = EXTRACT(a, 5) ^ r;
695 		a1 = q[a1];
696 		a2 = q[a2];
697 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
698 		a1 = EXTRACT(a, 5) ^ r;
699 		a2 = EXTRACT(a, 6) ^ r;
700 		a1 = q[a1];
701 		a2 = q[a2];
702 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
703 #if RF_LONGSHIFT > 2
704 		a1 = EXTRACT(a, 7) ^ r;
705 		a2 = EXTRACT(a, 8) ^ r;
706 		a1 = q[a1];
707 		a2 = q[a2];
708 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
709 		a1 = EXTRACT(a, 9) ^ r;
710 		a2 = EXTRACT(a, 10) ^ r;
711 		a1 = q[a1];
712 		a2 = q[a2];
713 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
714 		a1 = EXTRACT(a, 11) ^ r;
715 		a2 = EXTRACT(a, 12) ^ r;
716 		a1 = q[a1];
717 		a2 = q[a2];
718 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
719 #endif				/* RF_LONGSHIFT > 2 */
720 		d ^= new;
721 		*dest++ = d;
722 		length--;
723 	}
724 }
725 /*
726    compute
727 
728    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
729 
730    on a five bit basis.
731    optimization: compute old ^ new on 64 bit basis.
732 
733    length in bytes.
734 */
735 
736 static void
737 QDelta(
738     char *dest,
739     char *obuf,
740     char *nbuf,
741     unsigned length,
742     unsigned char coeff)
743 {
744 	unsigned long a, d, new;
745 	unsigned long a1, a2;
746 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
747 	unsigned int r = rf_rn[coeff + 1];
748 
749 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
750 	q = NULL; /* XXX for now */
751 
752 #ifdef _KERNEL
753 	/* PQ in kernel currently not supported because the encoding/decoding
754 	 * table is not present */
755 	bzero(dest, length);
756 #else				/* KERNEL */
757 	/* this code probably doesn't work and should be rewritten  -wvcii */
758 	/* 13 5 bit quants in a 64 bit word */
759 	length /= 8;
760 	while (length) {
761 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
762 		a ^= *nbuf++;
763 		d = *dest;
764 		a1 = EXTRACT(a, 0) ^ r;
765 		a2 = EXTRACT(a, 1) ^ r;
766 		a1 = q[a1];
767 		a2 = q[a2];
768 		new = INSERT(a2, 1) | a1;
769 		a1 = EXTRACT(a, 2) ^ r;
770 		a2 = EXTRACT(a, 3) ^ r;
771 		a1 = q[a1];
772 		a2 = q[a2];
773 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
774 		a1 = EXTRACT(a, 4) ^ r;
775 		a2 = EXTRACT(a, 5) ^ r;
776 		a1 = q[a1];
777 		a2 = q[a2];
778 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
779 		a1 = EXTRACT(a, 5) ^ r;
780 		a2 = EXTRACT(a, 6) ^ r;
781 		a1 = q[a1];
782 		a2 = q[a2];
783 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
784 #if RF_LONGSHIFT > 2
785 		a1 = EXTRACT(a, 7) ^ r;
786 		a2 = EXTRACT(a, 8) ^ r;
787 		a1 = q[a1];
788 		a2 = q[a2];
789 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
790 		a1 = EXTRACT(a, 9) ^ r;
791 		a2 = EXTRACT(a, 10) ^ r;
792 		a1 = q[a1];
793 		a2 = q[a2];
794 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
795 		a1 = EXTRACT(a, 11) ^ r;
796 		a2 = EXTRACT(a, 12) ^ r;
797 		a1 = q[a1];
798 		a2 = q[a2];
799 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
800 #endif				/* RF_LONGSHIFT > 2 */
801 		d ^= new;
802 		*dest++ = d;
803 		length--;
804 	}
805 #endif				/* _KERNEL */
806 }
807 /*
808    recover columns a and b from the given p and q into
809    bufs abuf and bbuf. All bufs are word aligned.
810    Length is in bytes.
811 */
812 
813 
814 /*
815  * XXX
816  *
817  * Everything about this seems wrong.
818  */
819 void
820 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
821 	unsigned long *pbuf;
822 	unsigned long *qbuf;
823 	unsigned long *abuf;
824 	unsigned long *bbuf;
825 	unsigned length;
826 	unsigned coeff_a;
827 	unsigned coeff_b;
828 {
829 	unsigned long p, q, a, a0, a1;
830 	int     col = (29 * coeff_a) + coeff_b;
831 	unsigned char *q0 = &(rf_qinv[col][0]);
832 
833 	length /= 8;
834 	while (length) {
835 		p = *pbuf++;
836 		q = *qbuf++;
837 		a0 = EXTRACT(p, 0);
838 		a1 = EXTRACT(q, 0);
839 		a = q0[a0 << 5 | a1];
840 #define MF(i) \
841       a0 = EXTRACT(p,i); \
842       a1 = EXTRACT(q,i); \
843       a  = a | INSERT(q0[a0<<5 | a1],i)
844 
845 		MF(1);
846 		MF(2);
847 		MF(3);
848 		MF(4);
849 		MF(5);
850 		MF(6);
851 #if 0
852 		MF(7);
853 		MF(8);
854 		MF(9);
855 		MF(10);
856 		MF(11);
857 		MF(12);
858 #endif				/* 0 */
859 		*abuf++ = a;
860 		*bbuf++ = a ^ p;
861 		length--;
862 	}
863 }
864 /*
865    Lost parity and a data column. Recover that data column.
866    Assume col coeff is lost. Let q the contents of Q after
867    all surviving data columns have been q-xored out of it.
868    Then we have the equation
869 
870    q[28-coeff][a_i ^ r_i+1] = q
871 
872    but q is cyclic with period 31.
873    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
874       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
875 
876    so a_i = r_{coeff+1} ^ q[3+coeff][q]
877 
878    The routine is passed q buffer and the buffer
   the data is to be recovered into. They can be the same.
880 */
881 
882 
883 
884 static void
885 rf_InvertQ(
886     unsigned long *qbuf,
887     unsigned long *abuf,
888     unsigned length,
889     unsigned coeff)
890 {
891 	unsigned long a, new;
892 	unsigned long a1, a2;
893 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
894 	unsigned r = rf_rn[coeff + 1];
895 
896 	/* 13 5 bit quants in a 64 bit word */
897 	length /= 8;
898 	while (length) {
899 		a = *qbuf++;
900 		a1 = EXTRACT(a, 0);
901 		a2 = EXTRACT(a, 1);
902 		a1 = r ^ q[a1];
903 		a2 = r ^ q[a2];
904 		new = INSERT(a2, 1) | a1;
905 #define M(i,j) \
906       a1 = EXTRACT(a,i); \
907       a2 = EXTRACT(a,j); \
908       a1 = r ^ q[a1]; \
909       a2 = r ^ q[a2]; \
910       new = new | INSERT(a1,i) | INSERT(a2,j)
911 
912 		M(2, 3);
913 		M(4, 5);
914 		M(5, 6);
915 #if RF_LONGSHIFT > 2
916 		M(7, 8);
917 		M(9, 10);
918 		M(11, 12);
919 #endif				/* RF_LONGSHIFT > 2 */
920 		*abuf++ = new;
921 		length--;
922 	}
923 }
924 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
925 				 * (RF_INCLUDE_RAID6 > 0) */
926