xref: /netbsd-src/sys/dev/raidframe/rf_pq.c (revision 23c8222edbfb0f0932d88a8351d3a0cf817dfb9e)
1 /*	$NetBSD: rf_pq.c,v 1.13 2003/11/16 20:32:05 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Daniel Stodolsky
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * Code for RAID level 6 (P + Q) disk array architecture.
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.13 2003/11/16 20:32:05 oster Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
39 
40 #include <dev/raidframe/raidframevar.h>
41 
42 #include "rf_raid.h"
43 #include "rf_dag.h"
44 #include "rf_dagffrd.h"
45 #include "rf_dagffwr.h"
46 #include "rf_dagdegrd.h"
47 #include "rf_dagdegwr.h"
48 #include "rf_dagutils.h"
49 #include "rf_dagfuncs.h"
50 #include "rf_etimer.h"
51 #include "rf_pqdeg.h"
52 #include "rf_general.h"
53 #include "rf_map.h"
54 #include "rf_pq.h"
55 
/*
 * Redundancy-function tables for the P (parity) unit.  Each initializer
 * lists a function followed by its descriptive string, twice.
 */
RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
/* P recovery: the same recovery routine is installed in both slots. */
RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
58 
59 int
60 rf_RegularONPFunc(node)
61 	RF_DagNode_t *node;
62 {
63 	return (rf_RegularXorFunc(node));
64 }
65 /*
66    same as simpleONQ func, but the coefficient is always 1
67 */
68 
69 int
70 rf_SimpleONPFunc(node)
71 	RF_DagNode_t *node;
72 {
73 	return (rf_SimpleXorFunc(node));
74 }
75 
76 int
77 rf_RecoveryPFunc(node)
78 	RF_DagNode_t *node;
79 {
80 	return (rf_RecoveryXorFunc(node));
81 }
82 
83 int
84 rf_RegularPFunc(node)
85 	RF_DagNode_t *node;
86 {
87 	return (rf_RegularXorFunc(node));
88 }
89 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
90 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
91 
/*
 * Forward declarations of the file-local Q helpers defined further down:
 * QDelta() folds the difference of an old and a new data buffer into a Q
 * buffer, rf_InvertQ() turns a reduced Q buffer back into data.
 * Lengths are byte counts.
 */
static void
QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff);
static void
rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
    unsigned length, unsigned coeff);
98 
/*
 * Redundancy-function tables for the Q unit.  Each initializer lists a
 * function followed by its descriptive string, twice.
 */
RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
/* Q recovery: the same routine is installed in both slots. */
RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
/* Combined P+Q recovery; rf_RecoveryPQFunc only reports "not implemented". */
RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
102 
103 void
104 rf_PQDagSelect(
105     RF_Raid_t * raidPtr,
106     RF_IoType_t type,
107     RF_AccessStripeMap_t * asmap,
108     RF_VoidFuncPtr * createFunc)
109 {
110 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
111 	unsigned ndfail = asmap->numDataFailed;
112 	unsigned npfail = asmap->numParityFailed;
113 	unsigned ntfail = npfail + ndfail;
114 
115 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
116 	if (ntfail > 2) {
117 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
118 		*createFunc = NULL;
119 		return;
120 	}
121 	/* ok, we can do this I/O */
122 	if (type == RF_IO_TYPE_READ) {
123 		switch (ndfail) {
124 		case 0:
125 			/* fault free read */
126 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
127 			break;
128 		case 1:
129 			/* lost a single data unit */
130 			/* two cases: (1) parity is not lost. do a normal raid
131 			 * 5 reconstruct read. (2) parity is lost. do a
132 			 * reconstruct read using "q". */
133 			if (ntfail == 2) {	/* also lost redundancy */
134 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
135 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
136 				else
137 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
138 			} else {
139 				/* P and Q are ok. But is there a failure in
140 				 * some unaccessed data unit? */
141 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
142 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
143 				else
144 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
145 			}
146 			break;
147 		case 2:
148 			/* lost two data units */
149 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
150 			break;
151 		}
152 		return;
153 	}
154 	/* a write */
155 	switch (ntfail) {
156 	case 0:		/* fault free */
157 		if (rf_suppressLocksAndLargeWrites ||
158 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
159 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
160 
161 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
162 		} else {
163 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
164 		}
165 		break;
166 
167 	case 1:		/* single disk fault */
168 		if (npfail == 1) {
169 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
170 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
171 										 * normal mode raid5
172 										 * write. */
173 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
174 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
175 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
176 				else
177 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
178 			} else {/* parity died, small write only updating Q */
179 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
180 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
181 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
182 				else
183 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
184 			}
185 		} else {	/* data missing. Do a P reconstruct write if
186 				 * only a single data unit is lost in the
187 				 * stripe, otherwise a PQ reconstruct write. */
188 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
189 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
190 			else
191 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
192 		}
193 		break;
194 
195 	case 2:		/* two disk faults */
196 		switch (npfail) {
197 		case 2:	/* both p and q dead */
198 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
199 			break;
200 		case 1:	/* either p or q and dead data */
201 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
202 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
203 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
204 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
205 			else
206 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
207 			break;
208 		case 0:	/* double data loss */
209 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
210 			break;
211 		}
212 		break;
213 
214 	default:		/* more than 2 disk faults */
215 		*createFunc = NULL;
216 		RF_PANIC();
217 	}
218 	return;
219 }
220 /*
221    Used as a stop gap info function
222 */
223 #if 0
224 static void
225 PQOne(raidPtr, nSucc, nAnte, asmap)
226 	RF_Raid_t *raidPtr;
227 	int    *nSucc;
228 	int    *nAnte;
229 	RF_AccessStripeMap_t *asmap;
230 {
231 	*nSucc = *nAnte = 1;
232 }
233 
234 static void
235 PQOneTwo(raidPtr, nSucc, nAnte, asmap)
236 	RF_Raid_t *raidPtr;
237 	int    *nSucc;
238 	int    *nAnte;
239 	RF_AccessStripeMap_t *asmap;
240 {
241 	*nSucc = 1;
242 	*nAnte = 2;
243 }
244 #endif
245 
/*
 * Large-write DAG creation for P+Q.  The signature comes from
 * RF_CREATE_DAG_FUNC_DECL; the body forwards its arguments to
 * rf_CommonCreateLargeWriteDAG() with rf_RegularPQFunc as the redundancy
 * function.
 * NOTE(review): the literal 2 is presumably the number of redundancy units
 * (P and Q) and RF_FALSE a behaviour flag of the common routine -- confirm
 * against rf_CommonCreateLargeWriteDAG().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
{
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
	    rf_RegularPQFunc, RF_FALSE);
}
251 
252 int
253 rf_RegularONQFunc(node)
254 	RF_DagNode_t *node;
255 {
256 	int     np = node->numParams;
257 	int     d;
258 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
259 	int     i;
260 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
261 	RF_Etimer_t timer;
262 	char   *qbuf, *qpbuf;
263 	char   *obuf, *nbuf;
264 	RF_PhysDiskAddr_t *old, *new;
265 	unsigned long coeff;
266 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
267 
268 	RF_ETIMER_START(timer);
269 
270 	d = (np - 3) / 4;
271 	RF_ASSERT(4 * d + 3 == np);
272 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
273 	for (i = 0; i < d; i++) {
274 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
275 		obuf = (char *) node->params[2 * i + 1].p;
276 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
277 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
278 		RF_ASSERT(new->numSector == old->numSector);
279 		RF_ASSERT(new->raidAddress == old->raidAddress);
280 		/* the stripe unit within the stripe tells us the coefficient
281 		 * to use for the multiply. */
282 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
283 		/* compute the data unit offset within the column, then add
284 		 * one */
285 		coeff = (coeff % raidPtr->Layout.numDataCol);
286 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
287 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
288 	}
289 
290 	RF_ETIMER_STOP(timer);
291 	RF_ETIMER_EVAL(timer);
292 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
293 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
294 					 * I/O in this node */
295 	return (0);
296 }
297 /*
298    See the SimpleXORFunc for the difference between a simple and regular func.
299    These Q functions should be used for
300 
301          new q = Q(data,old data,old q)
302 
303    style updates and not for
304 
305          q = ( new data, new data, .... )
306 
307    computations.
308 
   The simple q takes 2(2d+1)+1 = 4d+3 params, where d is the number
   of stripe units written. The order of params is
   [0]    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_(d-1), old data buffer_(d-1)
   [2d]   old q pda, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ... new data pda_(d-1), new data buffer_(d-1)
   [4d+2] raidPtr
315 */
316 
317 int
318 rf_SimpleONQFunc(node)
319 	RF_DagNode_t *node;
320 {
321 	int     np = node->numParams;
322 	int     d;
323 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
324 	int     i;
325 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
326 	RF_Etimer_t timer;
327 	char   *qbuf;
328 	char   *obuf, *nbuf;
329 	RF_PhysDiskAddr_t *old, *new;
330 	unsigned long coeff;
331 
332 	RF_ETIMER_START(timer);
333 
334 	d = (np - 3) / 4;
335 	RF_ASSERT(4 * d + 3 == np);
336 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
337 	for (i = 0; i < d; i++) {
338 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
339 		obuf = (char *) node->params[2 * i + 1].p;
340 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
341 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
342 		RF_ASSERT(new->numSector == old->numSector);
343 		RF_ASSERT(new->raidAddress == old->raidAddress);
344 		/* the stripe unit within the stripe tells us the coefficient
345 		 * to use for the multiply. */
346 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
347 		/* compute the data unit offset within the column, then add
348 		 * one */
349 		coeff = (coeff % raidPtr->Layout.numDataCol);
350 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
351 	}
352 
353 	RF_ETIMER_STOP(timer);
354 	RF_ETIMER_EVAL(timer);
355 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
356 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
357 					 * I/O in this node */
358 	return (0);
359 }
/*
 * Small-write DAG creation for P+Q.  The signature comes from
 * RF_CREATE_DAG_FUNC_DECL; the body forwards its arguments to
 * rf_CommonCreateSmallWriteDAG(), supplying the P and Q old-new function
 * tables (rf_pFuncs, rf_qFuncs).
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
{
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
}
364 
365 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
366 
367 static void
368 RegularQSubr(node, qbuf)
369 	RF_DagNode_t *node;
370 	char   *qbuf;
371 {
372 	int     np = node->numParams;
373 	int     d;
374 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
375 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
376 	int     i;
377 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
378 	RF_Etimer_t timer;
379 	char   *obuf, *qpbuf;
380 	RF_PhysDiskAddr_t *old;
381 	unsigned long coeff;
382 
383 	RF_ETIMER_START(timer);
384 
385 	d = (np - 1) / 2;
386 	RF_ASSERT(2 * d + 1 == np);
387 	for (i = 0; i < d; i++) {
388 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
389 		obuf = (char *) node->params[2 * i + 1].p;
390 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
391 		/* compute the data unit offset within the column, then add
392 		 * one */
393 		coeff = (coeff % raidPtr->Layout.numDataCol);
394 		/* the input buffers may not all be aligned with the start of
395 		 * the stripe. so shift by their sector offset within the
396 		 * stripe unit */
397 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
398 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
399 	}
400 
401 	RF_ETIMER_STOP(timer);
402 	RF_ETIMER_EVAL(timer);
403 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
404 }
405 /*
406    used in degraded writes.
407 */
408 
409 static void DegrQSubr(RF_DagNode_t *node);
410 
411 static void
412 DegrQSubr(node)
413 	RF_DagNode_t *node;
414 {
415 	int     np = node->numParams;
416 	int     d;
417 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
418 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
419 	int     i;
420 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
421 	RF_Etimer_t timer;
422 	char   *qbuf = node->results[1];
423 	char   *obuf, *qpbuf;
424 	RF_PhysDiskAddr_t *old;
425 	unsigned long coeff;
426 	unsigned fail_start;
427 	int     j;
428 
429 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
430 	fail_start = old->startSector % secPerSU;
431 
432 	RF_ETIMER_START(timer);
433 
434 	d = (np - 2) / 2;
435 	RF_ASSERT(2 * d + 2 == np);
436 	for (i = 0; i < d; i++) {
437 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
438 		obuf = (char *) node->params[2 * i + 1].p;
439 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
440 		/* compute the data unit offset within the column, then add
441 		 * one */
442 		coeff = (coeff % raidPtr->Layout.numDataCol);
443 		/* the input buffers may not all be aligned with the start of
444 		 * the stripe. so shift by their sector offset within the
445 		 * stripe unit */
446 		j = old->startSector % secPerSU;
447 		RF_ASSERT(j >= fail_start);
448 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
449 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
450 	}
451 
452 	RF_ETIMER_STOP(timer);
453 	RF_ETIMER_EVAL(timer);
454 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
455 }
456 /*
457    Called by large write code to compute the new parity and the new q.
458 
459    structure of the params:
460 
   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
462    raidPtr
463 
464    for a total of 2d+1 arguments.
465    The result buffers results[0], results[1] are the buffers for the p and q,
466    respectively.
467 
468    We compute Q first, then compute P. The P calculation may try to reuse
469    one of the input buffers for its output, so if we computed P first, we would
470    corrupt the input for the q calculation.
471 */
472 
473 int
474 rf_RegularPQFunc(node)
475 	RF_DagNode_t *node;
476 {
477 	RegularQSubr(node, node->results[1]);
478 	return (rf_RegularXorFunc(node));	/* does the wakeup */
479 }
480 
481 int
482 rf_RegularQFunc(node)
483 	RF_DagNode_t *node;
484 {
485 	/* Almost ... adjust Qsubr args */
486 	RegularQSubr(node, node->results[0]);
487 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
488 					 * I/O in this node */
489 	return (0);
490 }
491 /*
492    Called by singly degraded write code to compute the new parity and the new q.
493 
494    structure of the params:
495 
496    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
497    failedPDA raidPtr
498 
499    for a total of 2d+2 arguments.
500    The result buffers results[0], results[1] are the buffers for the parity and q,
501    respectively.
502 
503    We compute Q first, then compute parity. The parity calculation may try to reuse
504    one of the input buffers for its output, so if we computed parity first, we would
505    corrupt the input for the q calculation.
506 
507    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
508 */
509 
510 void
511 rf_Degraded_100_PQFunc(node)
512 	RF_DagNode_t *node;
513 {
514 	int     np = node->numParams;
515 
516 	RF_ASSERT(np >= 2);
517 	DegrQSubr(node);
518 	rf_RecoveryXorFunc(node);
519 }
520 
521 
522 /*
523    The two below are used when reading a stripe with a single lost data unit.
524    The parameters are
525 
526    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
527 
528    and results[0] contains the data buffer. Which is originally zero-filled.
529 
530 */
531 
532 /* this Q func is used by the degraded-mode dag functions to recover lost data.
533  * the second-to-last parameter is the PDA for the failed portion of the access.
534  * the code here looks at this PDA and assumes that the xor target buffer is
535  * equal in size to the number of sectors in the failed PDA.  It then uses
536  * the other PDAs in the parameter list to determine where within the target
537  * buffer the corresponding data should be xored.
538  *
539  * Recall the basic equation is
540  *
541  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
542  *
543  * so to recover data_j we need
544  *
545  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
546  *
547  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
548  * copying Q into it. Then we need to do a table lookup to convert to solve
549  *   data_j /= J
550  *
551  *
552  */
553 int
554 rf_RecoveryQFunc(node)
555 	RF_DagNode_t *node;
556 {
557 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
558 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
559 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
560 	int     i;
561 	RF_PhysDiskAddr_t *pda;
562 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
563 	char   *srcbuf, *destbuf;
564 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
565 	RF_Etimer_t timer;
566 	unsigned long coeff;
567 
568 	RF_ETIMER_START(timer);
569 	/* start by copying Q into the buffer */
570 	memcpy(node->results[0], node->params[node->numParams - 3].p,
571 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
572 	for (i = 0; i < node->numParams - 4; i += 2) {
573 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
574 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
575 		srcbuf = (char *) node->params[i + 1].p;
576 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
577 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
578 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
579 		/* compute the data unit offset within the column */
580 		coeff = (coeff % raidPtr->Layout.numDataCol);
581 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
582 	}
583 	/* Do the nasty inversion now */
584 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
585 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
586 	RF_ETIMER_STOP(timer);
587 	RF_ETIMER_EVAL(timer);
588 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
589 	rf_GenericWakeupFunc(node, 0);
590 	return (0);
591 }
592 
593 int
594 rf_RecoveryPQFunc(node)
595 	RF_DagNode_t *node;
596 {
597 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
598 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
599 	return (1);
600 }
601 /*
602    Degraded write Q subroutine.
603    Used when P is dead.
604    Large-write style Q computation.
605    Parameters
606 
607    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
608 
609    We ignore failedPDA.
610 
611    This is a "simple style" recovery func.
612 */
613 
614 void
615 rf_PQ_DegradedWriteQFunc(node)
616 	RF_DagNode_t *node;
617 {
618 	int     np = node->numParams;
619 	int     d;
620 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
621 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
622 	int     i;
623 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
624 	RF_Etimer_t timer;
625 	char   *qbuf = node->results[0];
626 	char   *obuf, *qpbuf;
627 	RF_PhysDiskAddr_t *old;
628 	unsigned long coeff;
629 	int     fail_start, j;
630 
631 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
632 	fail_start = old->startSector % secPerSU;
633 
634 	RF_ETIMER_START(timer);
635 
636 	d = (np - 2) / 2;
637 	RF_ASSERT(2 * d + 2 == np);
638 
639 	for (i = 0; i < d; i++) {
640 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
641 		obuf = (char *) node->params[2 * i + 1].p;
642 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
643 		/* compute the data unit offset within the column, then add
644 		 * one */
645 		coeff = (coeff % raidPtr->Layout.numDataCol);
646 		j = old->startSector % secPerSU;
647 		RF_ASSERT(j >= fail_start);
648 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
649 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
650 	}
651 
652 	RF_ETIMER_STOP(timer);
653 	RF_ETIMER_EVAL(timer);
654 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
655 	rf_GenericWakeupFunc(node, 0);
656 }
657 
658 
659 
660 
661 /* Q computations */
662 
663 /*
   coeff - column;
665 
666    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
667 
668    on 5-bit basis;
669    length in bytes;
670 */
671 
672 void
673 rf_IncQ(dest, buf, length, coeff)
674 	unsigned long *dest;
675 	unsigned long *buf;
676 	unsigned length;
677 	unsigned coeff;
678 {
679 	unsigned long a, d, new;
680 	unsigned long a1, a2;
681 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
682 	unsigned r = rf_rn[coeff + 1];
683 
684 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
685 #define INSERT(a,i) (a << (5L*i))
686 
687 	length /= 8;
688 	/* 13 5 bit quants in a 64 bit word */
689 	while (length) {
690 		a = *buf++;
691 		d = *dest;
692 		a1 = EXTRACT(a, 0) ^ r;
693 		a2 = EXTRACT(a, 1) ^ r;
694 		new = INSERT(a2, 1) | a1;
695 		a1 = EXTRACT(a, 2) ^ r;
696 		a2 = EXTRACT(a, 3) ^ r;
697 		a1 = q[a1];
698 		a2 = q[a2];
699 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
700 		a1 = EXTRACT(a, 4) ^ r;
701 		a2 = EXTRACT(a, 5) ^ r;
702 		a1 = q[a1];
703 		a2 = q[a2];
704 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
705 		a1 = EXTRACT(a, 5) ^ r;
706 		a2 = EXTRACT(a, 6) ^ r;
707 		a1 = q[a1];
708 		a2 = q[a2];
709 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
710 #if RF_LONGSHIFT > 2
711 		a1 = EXTRACT(a, 7) ^ r;
712 		a2 = EXTRACT(a, 8) ^ r;
713 		a1 = q[a1];
714 		a2 = q[a2];
715 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
716 		a1 = EXTRACT(a, 9) ^ r;
717 		a2 = EXTRACT(a, 10) ^ r;
718 		a1 = q[a1];
719 		a2 = q[a2];
720 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
721 		a1 = EXTRACT(a, 11) ^ r;
722 		a2 = EXTRACT(a, 12) ^ r;
723 		a1 = q[a1];
724 		a2 = q[a2];
725 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
726 #endif				/* RF_LONGSHIFT > 2 */
727 		d ^= new;
728 		*dest++ = d;
729 		length--;
730 	}
731 }
732 /*
733    compute
734 
735    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
736 
737    on a five bit basis.
738    optimization: compute old ^ new on 64 bit basis.
739 
740    length in bytes.
741 */
742 
743 static void
744 QDelta(
745     char *dest,
746     char *obuf,
747     char *nbuf,
748     unsigned length,
749     unsigned char coeff)
750 {
751 	unsigned long a, d, new;
752 	unsigned long a1, a2;
753 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
754 	unsigned int r = rf_rn[coeff + 1];
755 
756 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
757 	q = NULL; /* XXX for now */
758 
759 #ifdef _KERNEL
760 	/* PQ in kernel currently not supported because the encoding/decoding
761 	 * table is not present */
762 	memset(dest, 0, length);
763 #else				/* KERNEL */
764 	/* this code probably doesn't work and should be rewritten  -wvcii */
765 	/* 13 5 bit quants in a 64 bit word */
766 	length /= 8;
767 	while (length) {
768 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
769 		a ^= *nbuf++;
770 		d = *dest;
771 		a1 = EXTRACT(a, 0) ^ r;
772 		a2 = EXTRACT(a, 1) ^ r;
773 		a1 = q[a1];
774 		a2 = q[a2];
775 		new = INSERT(a2, 1) | a1;
776 		a1 = EXTRACT(a, 2) ^ r;
777 		a2 = EXTRACT(a, 3) ^ r;
778 		a1 = q[a1];
779 		a2 = q[a2];
780 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
781 		a1 = EXTRACT(a, 4) ^ r;
782 		a2 = EXTRACT(a, 5) ^ r;
783 		a1 = q[a1];
784 		a2 = q[a2];
785 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
786 		a1 = EXTRACT(a, 5) ^ r;
787 		a2 = EXTRACT(a, 6) ^ r;
788 		a1 = q[a1];
789 		a2 = q[a2];
790 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
791 #if RF_LONGSHIFT > 2
792 		a1 = EXTRACT(a, 7) ^ r;
793 		a2 = EXTRACT(a, 8) ^ r;
794 		a1 = q[a1];
795 		a2 = q[a2];
796 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
797 		a1 = EXTRACT(a, 9) ^ r;
798 		a2 = EXTRACT(a, 10) ^ r;
799 		a1 = q[a1];
800 		a2 = q[a2];
801 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
802 		a1 = EXTRACT(a, 11) ^ r;
803 		a2 = EXTRACT(a, 12) ^ r;
804 		a1 = q[a1];
805 		a2 = q[a2];
806 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
807 #endif				/* RF_LONGSHIFT > 2 */
808 		d ^= new;
809 		*dest++ = d;
810 		length--;
811 	}
812 #endif				/* _KERNEL */
813 }
814 /*
815    recover columns a and b from the given p and q into
816    bufs abuf and bbuf. All bufs are word aligned.
817    Length is in bytes.
818 */
819 
820 
821 /*
822  * XXX
823  *
824  * Everything about this seems wrong.
825  */
826 void
827 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
828 	unsigned long *pbuf;
829 	unsigned long *qbuf;
830 	unsigned long *abuf;
831 	unsigned long *bbuf;
832 	unsigned length;
833 	unsigned coeff_a;
834 	unsigned coeff_b;
835 {
836 	unsigned long p, q, a, a0, a1;
837 	int     col = (29 * coeff_a) + coeff_b;
838 	unsigned char *q0 = &(rf_qinv[col][0]);
839 
840 	length /= 8;
841 	while (length) {
842 		p = *pbuf++;
843 		q = *qbuf++;
844 		a0 = EXTRACT(p, 0);
845 		a1 = EXTRACT(q, 0);
846 		a = q0[a0 << 5 | a1];
847 #define MF(i) \
848       a0 = EXTRACT(p,i); \
849       a1 = EXTRACT(q,i); \
850       a  = a | INSERT(q0[a0<<5 | a1],i)
851 
852 		MF(1);
853 		MF(2);
854 		MF(3);
855 		MF(4);
856 		MF(5);
857 		MF(6);
858 #if 0
859 		MF(7);
860 		MF(8);
861 		MF(9);
862 		MF(10);
863 		MF(11);
864 		MF(12);
865 #endif				/* 0 */
866 		*abuf++ = a;
867 		*bbuf++ = a ^ p;
868 		length--;
869 	}
870 }
871 /*
872    Lost parity and a data column. Recover that data column.
873    Assume col coeff is lost. Let q the contents of Q after
874    all surviving data columns have been q-xored out of it.
875    Then we have the equation
876 
877    q[28-coeff][a_i ^ r_i+1] = q
878 
879    but q is cyclic with period 31.
880    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
881       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
882 
883    so a_i = r_{coeff+1} ^ q[3+coeff][q]
884 
   The routine is passed the q buffer and the buffer
   the data is to be recovered into. They can be the same.
887 */
888 
889 
890 
891 static void
892 rf_InvertQ(
893     unsigned long *qbuf,
894     unsigned long *abuf,
895     unsigned length,
896     unsigned coeff)
897 {
898 	unsigned long a, new;
899 	unsigned long a1, a2;
900 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
901 	unsigned r = rf_rn[coeff + 1];
902 
903 	/* 13 5 bit quants in a 64 bit word */
904 	length /= 8;
905 	while (length) {
906 		a = *qbuf++;
907 		a1 = EXTRACT(a, 0);
908 		a2 = EXTRACT(a, 1);
909 		a1 = r ^ q[a1];
910 		a2 = r ^ q[a2];
911 		new = INSERT(a2, 1) | a1;
912 #define M(i,j) \
913       a1 = EXTRACT(a,i); \
914       a2 = EXTRACT(a,j); \
915       a1 = r ^ q[a1]; \
916       a2 = r ^ q[a2]; \
917       new = new | INSERT(a1,i) | INSERT(a2,j)
918 
919 		M(2, 3);
920 		M(4, 5);
921 		M(5, 6);
922 #if RF_LONGSHIFT > 2
923 		M(7, 8);
924 		M(9, 10);
925 		M(11, 12);
926 #endif				/* RF_LONGSHIFT > 2 */
927 		*abuf++ = new;
928 		length--;
929 	}
930 }
931 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
932 				 * (RF_INCLUDE_RAID6 > 0) */
933