xref: /netbsd-src/sys/dev/raidframe/rf_raid1.c (revision dc306354b0b29af51801a7632f1e95265a68cd81)
1 /*	$NetBSD: rf_raid1.c,v 1.1 1998/11/13 04:20:33 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*****************************************************************************
30  *
31  * rf_raid1.c -- implements RAID Level 1
32  *
33  *****************************************************************************/
34 
35 /*
36  * :
37  * Log: rf_raid1.c,v
38  * Revision 1.46  1996/11/05 21:10:40  jimz
39  * failed pda generalization
40  *
41  * Revision 1.45  1996/07/31  16:56:18  jimz
42  * dataBytesPerStripe, sectorsPerDisk init arch-indep.
43  *
44  * Revision 1.44  1996/07/30  03:06:43  jimz
45  * get rid of extra rf_threadid.h include
46  *
47  * Revision 1.43  1996/07/27  23:36:08  jimz
48  * Solaris port of simulator
49  *
50  * Revision 1.42  1996/07/22  19:52:16  jimz
51  * switched node params to RF_DagParam_t, a union of
52  * a 64-bit int and a void *, for better portability
53  * attempted hpux port, but failed partway through for
54  * lack of a single C compiler capable of compiling all
55  * source files
56  *
57  * Revision 1.41  1996/07/18  22:57:14  jimz
58  * port simulator to AIX
59  *
60  * Revision 1.40  1996/07/17  14:31:19  jimz
61  * minor cleanup for readability
62  *
63  * Revision 1.39  1996/07/15  17:22:18  jimz
64  * nit-pick code cleanup
65  * resolve stdlib problems on DEC OSF
66  *
67  * Revision 1.38  1996/07/15  02:56:31  jimz
68  * fixed dag selection to deal with failed + recon to spare disks
69  * enhanced recon, parity check debugging
70  *
71  * Revision 1.37  1996/07/13  00:00:59  jimz
72  * sanitized generalized reconstruction architecture
73  * cleaned up head sep, rbuf problems
74  *
75  * Revision 1.36  1996/07/11  19:08:00  jimz
76  * generalize reconstruction mechanism
77  * allow raid1 reconstructs via copyback (done with array
78  * quiesced, not online, therefore not disk-directed)
79  *
80  * Revision 1.35  1996/07/10  23:01:24  jimz
81  * Better commenting of VerifyParity (for posterity)
82  *
83  * Revision 1.34  1996/07/10  22:29:45  jimz
84  * VerifyParityRAID1: corrected return values for stripes in degraded mode
85  *
86  * Revision 1.33  1996/07/10  16:05:39  jimz
87  * fixed a couple minor bugs in VerifyParityRAID1
88  * added code to correct bad RAID1 parity
89  *
90  * Revision 1.32  1996/06/20  18:47:04  jimz
91  * fix up verification bugs
92  *
93  * Revision 1.31  1996/06/20  15:38:59  jimz
94  * added parity verification
95  * can't correct bad parity yet, but can return pass/fail
96  *
97  * Revision 1.30  1996/06/19  22:23:01  jimz
98  * parity verification is now a layout-configurable thing
99  * not all layouts currently support it (correctly, anyway)
100  *
101  * Revision 1.29  1996/06/11  08:54:27  jimz
102  * improved error-checking at configuration time
103  *
104  * Revision 1.28  1996/06/10  18:25:24  wvcii
105  * fixed bug in rf_IdentifyStripeRAID1 - added array initialization
106  *
107  * Revision 1.27  1996/06/10  11:55:47  jimz
108  * Straightened out some per-array/not-per-array distinctions, fixed
109  * a couple bugs related to confusion. Added shutdown lists. Removed
110  * layout shutdown function (now subsumed by shutdown lists).
111  *
112  * Revision 1.26  1996/06/07  22:26:27  jimz
113  * type-ify which_ru (RF_ReconUnitNum_t)
114  *
115  * Revision 1.25  1996/06/07  21:33:04  jimz
116  * begin using consistent types for sector numbers,
117  * stripe numbers, row+col numbers, recon unit numbers
118  *
119  * Revision 1.24  1996/06/06  17:29:43  jimz
120  * use CreateMirrorIdleReadDAG for mirrored read
121  *
122  * Revision 1.23  1996/06/03  23:28:26  jimz
123  * more bugfixes
124  * check in tree to sync for IPDS runs with current bugfixes
125  * there still may be a problem with threads in the script test
126  * getting I/Os stuck- not trivially reproducible (runs ~50 times
127  * in a row without getting stuck)
128  *
129  * Revision 1.22  1996/06/02  17:31:48  jimz
130  * Moved a lot of global stuff into array structure, where it belongs.
131  * Fixed up paritylogging, pss modules in this manner. Some general
132  * code cleanup. Removed lots of dead code, some dead files.
133  *
134  * Revision 1.21  1996/05/31  22:26:54  jimz
135  * fix a lot of mapping problems, memory allocation problems
136  * found some weird lock issues, fixed 'em
137  * more code cleanup
138  *
139  * Revision 1.20  1996/05/30  23:22:16  jimz
140  * bugfixes of serialization, timing problems
141  * more cleanup
142  *
143  * Revision 1.19  1996/05/30  11:29:41  jimz
144  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
145  * about when stripes should be locked (I made it consistent: no parity, no lock)
146  * There was a lot of extra serialization of I/Os which I've removed- a lot of
147  * it was to calculate values for the cache code, which is no longer with us.
148  * More types, function, macro cleanup. Added code to properly quiesce the array
149  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
150  * before. Fixed memory allocation, freeing bugs.
151  *
152  * Revision 1.18  1996/05/27  18:56:37  jimz
153  * more code cleanup
154  * better typing
155  * compiles in all 3 environments
156  *
157  * Revision 1.17  1996/05/24  22:17:04  jimz
158  * continue code + namespace cleanup
159  * typed a bunch of flags
160  *
161  * Revision 1.16  1996/05/24  04:28:55  jimz
162  * release cleanup ckpt
163  *
164  * Revision 1.15  1996/05/24  01:59:45  jimz
165  * another checkpoint in code cleanup for release
166  * time to sync kernel tree
167  *
168  * Revision 1.14  1996/05/18  19:51:34  jimz
169  * major code cleanup- fix syntax, make some types consistent,
170  * add prototypes, clean out dead code, et cetera
171  *
172  * Revision 1.13  1996/05/03  19:36:22  wvcii
173  * moved dag creation routines to dag library
174  *
175  * Revision 1.12  1996/02/23  01:38:16  amiri
176  * removed chained declustering special case in SelectIdleDisk
177  *
178  * Revision 1.11  1996/02/22  16:47:18  amiri
179  * disabled shortest queue optimization for chained declustering
180  *
181  * Revision 1.10  1995/12/12  18:10:06  jimz
182  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
183  * fix 80-column brain damage in comments
184  *
185  * Revision 1.9  1995/12/04  19:21:28  wvcii
186  * modified SelectIdleDisk to take a mirror node as a parameter and
187  * conditionally swap params 0 (data pda) and 4 (mirror pda).
188  * modified CreateRaidOneReadDAG so that it creates the DAG itself
189  * as opposed to reusing code in CreateNonredundantDAG.
190  *
191  * Revision 1.8  1995/11/30  16:07:45  wvcii
192  * added copyright info
193  *
194  * Revision 1.7  1995/11/16  14:46:18  wvcii
195  * fixed bugs in mapping and degraded dag creation, added comments
196  *
197  * Revision 1.6  1995/11/14  22:29:16  wvcii
198  * fixed bugs in dag creation
199  *
200  * Revision 1.5  1995/11/07  15:23:33  wvcii
201  * changed RAID1DagSelect prototype
202  * function no longer generates numHdrSucc, numTermAnt
203  * changed dag creation routines:
204  *   term node generated during dag creation
205  *   encoded commit nodes, barrier, antecedent types
206  *
207  * Revision 1.4  1995/10/10  19:09:21  wvcii
208  * write dag now handles non-aligned accesses
209  *
210  * Revision 1.3  1995/10/05  02:32:56  jimz
211  * ifdef'd out queue locking for load balancing
212  *
213  * Revision 1.2  1995/10/04  07:04:40  wvcii
214  * reads are now scheduled according to disk queue length.
215  * queue length is the sum of number of ios queued in raidframe as well as those at the disk.
216  * reads are sent to the disk with the shortest queue.
217  * testing against user disks successful, sim & kernel untested.
218  *
219  * Revision 1.1  1995/10/04  03:53:23  wvcii
220  * Initial revision
221  *
222  *
223  */
224 
225 #include "rf_raid.h"
226 #include "rf_raid1.h"
227 #include "rf_dag.h"
228 #include "rf_dagffrd.h"
229 #include "rf_dagffwr.h"
230 #include "rf_dagdegrd.h"
231 #include "rf_dagutils.h"
232 #include "rf_dagfuncs.h"
233 #include "rf_threadid.h"
234 #include "rf_diskqueue.h"
235 #include "rf_general.h"
236 #include "rf_utils.h"
237 #include "rf_parityscan.h"
238 #include "rf_mcpair.h"
239 #include "rf_layout.h"
240 #include "rf_map.h"
241 #include "rf_engine.h"
242 #include "rf_reconbuffer.h"
243 #include "rf_sys.h"
244 
/*
 * Layout-specific configuration state for RAID level 1.
 * Built once at configuration time by rf_ConfigureRAID1().
 */
typedef struct RF_Raid1ConfigInfo_s {
  RF_RowCol_t  **stripeIdentifier;  /* [numCol/2][2] table: the two disk columns of each mirror pair */
} RF_Raid1ConfigInfo_t;
248 
249 /* start of day code specific to RAID level 1 */
250 int rf_ConfigureRAID1(
251   RF_ShutdownList_t  **listp,
252   RF_Raid_t           *raidPtr,
253   RF_Config_t         *cfgPtr)
254 {
255   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
256   RF_Raid1ConfigInfo_t *info;
257   RF_RowCol_t i;
258 
259   /* create a RAID level 1 configuration structure */
260   RF_MallocAndAdd(info, sizeof(RF_Raid1ConfigInfo_t), (RF_Raid1ConfigInfo_t *), raidPtr->cleanupList);
261   if (info == NULL)
262     return(ENOMEM);
263   layoutPtr->layoutSpecificInfo = (void *) info;
264 
265   /* ... and fill it in. */
266   info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol / 2, 2, raidPtr->cleanupList);
267   if (info->stripeIdentifier == NULL)
268     return(ENOMEM);
269   for (i = 0; i < (raidPtr->numCol / 2); i ++) {
270     info->stripeIdentifier[i][0] = (2 * i);
271     info->stripeIdentifier[i][1] = (2 * i) + 1;
272   }
273 
274   RF_ASSERT(raidPtr->numRow == 1);
275 
276   /* this implementation of RAID level 1 uses one row of numCol disks and allows multiple (numCol / 2)
277    * stripes per row.  A stripe consists of a single data unit and a single parity (mirror) unit.
278    * stripe id = raidAddr / stripeUnitSize
279    */
280   raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2) * layoutPtr->sectorsPerStripeUnit;
281   layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2);
282   layoutPtr->dataSectorsPerStripe = layoutPtr->sectorsPerStripeUnit;
283   layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
284   layoutPtr->numDataCol = 1;
285   layoutPtr->numParityCol = 1;
286   return(0);
287 }
288 
289 
290 /* returns the physical disk location of the primary copy in the mirror pair */
291 void rf_MapSectorRAID1(
292   RF_Raid_t         *raidPtr,
293   RF_RaidAddr_t      raidSector,
294   RF_RowCol_t       *row,
295   RF_RowCol_t       *col,
296   RF_SectorNum_t    *diskSector,
297   int                remap)
298 {
299   RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
300   RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);
301 
302   *row = 0;
303   *col = 2 * mirrorPair;
304   *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
305 }
306 
307 
308 /* Map Parity
309  *
310  * returns the physical disk location of the secondary copy in the mirror
311  * pair
312  */
313 void rf_MapParityRAID1(
314   RF_Raid_t       *raidPtr,
315   RF_RaidAddr_t    raidSector,
316   RF_RowCol_t     *row,
317   RF_RowCol_t     *col,
318   RF_SectorNum_t  *diskSector,
319   int              remap)
320 {
321   RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
322   RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);
323 
324   *row = 0;
325   *col = (2 * mirrorPair) + 1;
326 
327   *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
328 }
329 
330 
331 /* IdentifyStripeRAID1
332  *
333  * returns a list of disks for a given redundancy group
334  */
335 void rf_IdentifyStripeRAID1(
336   RF_Raid_t        *raidPtr,
337   RF_RaidAddr_t     addr,
338   RF_RowCol_t     **diskids,
339   RF_RowCol_t      *outRow)
340 {
341   RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
342   RF_Raid1ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo;
343   RF_ASSERT(stripeID >= 0);
344   RF_ASSERT(addr >= 0);
345   *outRow = 0;
346   *diskids = info->stripeIdentifier[ stripeID % (raidPtr->numCol/2)];
347   RF_ASSERT(*diskids);
348 }
349 
350 
351 /* MapSIDToPSIDRAID1
352  *
353  * maps a logical stripe to a stripe in the redundant array
354  */
355 void rf_MapSIDToPSIDRAID1(
356   RF_RaidLayout_t    *layoutPtr,
357   RF_StripeNum_t      stripeID,
358   RF_StripeNum_t     *psID,
359   RF_ReconUnitNum_t  *which_ru)
360 {
361   *which_ru = 0;
362   *psID = stripeID;
363 }
364 
365 
366 
/******************************************************************************
 * select a graph to perform a single-stripe access
 *
 * Parameters:  raidPtr    - description of the physical array
 *              type       - type of operation (read or write) requested
 *              asmap      - logical & physical addresses for this access
 *              createFunc - name of function to use to create the graph
 *                           (set to NULL if the access cannot be performed)
 *****************************************************************************/

void rf_RAID1DagSelect(
  RF_Raid_t             *raidPtr,
  RF_IoType_t            type,
  RF_AccessStripeMap_t  *asmap,
  RF_VoidFuncPtr        *createFunc)
{
  RF_RowCol_t frow, fcol, or, oc;   /* failed row/col; original (pre-redirect) row/col */
  RF_PhysDiskAddr_t *failedPDA;
  int prior_recon, tid;
  RF_RowStatus_t rstat;
  RF_SectorNum_t oo;                /* original (pre-redirect) sector offset */


  RF_ASSERT(RF_IO_IS_R_OR_W(type));

  /* with only a single mirror copy, two failures in one group are fatal */
  if (asmap->numDataFailed + asmap->numParityFailed > 1) {
    RF_ERRORMSG("Multiple disks failed in a single group!  Aborting I/O operation.\n");
    *createFunc = NULL;
    return;
  }

  if (asmap->numDataFailed + asmap->numParityFailed) {
    /*
     * We've got a fault. Re-map to spare space, iff applicable.
     * Shouldn't the arch-independent code do this for us?
     * Anyway, it turns out if we don't do this here, then when
     * we're reconstructing, writes go only to the surviving
     * original disk, and aren't reflected on the reconstructed
     * spare. Oops. --jimz
     */
    failedPDA = asmap->failedPDAs[0];
    frow = failedPDA->row;
    fcol = failedPDA->col;
    rstat = raidPtr->status[frow];
    /* has the failed unit already been rebuilt (fully reconfigured, or this
     * particular RU reconstructed to the spare)? */
    prior_recon = (rstat == rf_rs_reconfigured) || (
      (rstat == rf_rs_reconstructing) ?
      rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
      );
    if (prior_recon) {
      /* remember the original location for the debug printout below */
      or = frow;
      oc = fcol;
      oo = failedPDA->startSector;
      /*
       * If we did distributed sparing, we'd monkey with that here.
       * But we don't, so we'll
       */
      failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
      failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
      /*
       * Redirect other components, iff necessary. This looks
       * pretty suspicious to me, but it's what the raid5
       * DAG select does.
       */
      if (asmap->parityInfo->next) {
        if (failedPDA == asmap->parityInfo) {
          failedPDA->next->row = failedPDA->row;
          failedPDA->next->col = failedPDA->col;
        }
        else {
          if (failedPDA == asmap->parityInfo->next) {
            asmap->parityInfo->row = failedPDA->row;
            asmap->parityInfo->col = failedPDA->col;
          }
        }
      }
      if (rf_dagDebug || rf_mapDebug) {
        rf_get_threadid(tid);
        printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
          tid, type, or, oc, (long)oo, failedPDA->row, failedPDA->col,
          (long)failedPDA->startSector);
      }
      /* the access now targets a live (spare) disk, so no failures remain */
      asmap->numDataFailed = asmap->numParityFailed = 0;
    }
  }
  /* choose the DAG creation function: idle-disk read when healthy, degraded
   * read when a data disk is down, and the RAID-1 write DAG for all writes */
  if (type == RF_IO_TYPE_READ) {
    if (asmap->numDataFailed == 0)
      *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorIdleReadDAG;
    else
      *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneDegradedReadDAG;
  }
  else {
    *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG;
  }
}
460 
461 int rf_VerifyParityRAID1(
462   RF_Raid_t             *raidPtr,
463   RF_RaidAddr_t          raidAddr,
464   RF_PhysDiskAddr_t     *parityPDA,
465   int                    correct_it,
466   RF_RaidAccessFlags_t   flags)
467 {
468   int nbytes, bcount, stripeWidth, ret, i, j, tid=0, nbad, *bbufs;
469   RF_DagNode_t *blockNode, *unblockNode, *wrBlock;
470   RF_DagHeader_t *rd_dag_h, *wr_dag_h;
471   RF_AccessStripeMapHeader_t *asm_h;
472   RF_AllocListElem_t *allocList;
473   RF_AccTraceEntry_t tracerec;
474   RF_ReconUnitNum_t which_ru;
475   RF_RaidLayout_t *layoutPtr;
476   RF_AccessStripeMap_t *aasm;
477   RF_SectorCount_t nsector;
478   RF_RaidAddr_t startAddr;
479   char *buf, *buf1, *buf2;
480   RF_PhysDiskAddr_t *pda;
481   RF_StripeNum_t psID;
482   RF_MCPair_t *mcpair;
483 
484   if (rf_verifyParityDebug) {
485     rf_get_threadid(tid);
486   }
487 
488   layoutPtr = &raidPtr->Layout;
489   startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
490   nsector = parityPDA->numSector;
491   nbytes = rf_RaidAddressToByte(raidPtr, nsector);
492   psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
493 
494   asm_h = NULL;
495   rd_dag_h = wr_dag_h = NULL;
496   mcpair = NULL;
497 
498   ret = RF_PARITY_COULD_NOT_VERIFY;
499 
500   rf_MakeAllocList(allocList);
501   if (allocList == NULL)
502     return(RF_PARITY_COULD_NOT_VERIFY);
503   mcpair = rf_AllocMCPair();
504   if (mcpair == NULL)
505     goto done;
506   RF_ASSERT(layoutPtr->numDataCol == layoutPtr->numParityCol);
507   stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
508   bcount = nbytes*(layoutPtr->numDataCol + layoutPtr->numParityCol);
509   RF_MallocAndAdd(buf, bcount, (char *), allocList);
510   if (buf == NULL)
511     goto done;
512   if (rf_verifyParityDebug) {
513     printf("[%d] RAID1 parity verify: buf=%lx bcount=%d (%lx - %lx)\n",
514       tid, (long)buf, bcount, (long)buf, (long)buf+bcount);
515   }
516 
517   /*
518    * Generate a DAG which will read the entire stripe- then we can
519    * just compare data chunks versus "parity" chunks.
520    */
521 
522   rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, nbytes, buf,
523     rf_DiskReadFunc, rf_DiskReadUndoFunc, "Rod", allocList, flags,
524     RF_IO_NORMAL_PRIORITY);
525   if (rd_dag_h == NULL)
526     goto done;
527   blockNode = rd_dag_h->succedents[0];
528   unblockNode = blockNode->succedents[0]->succedents[0];
529 
530   /*
531    * Map the access to physical disk addresses (PDAs)- this will
532    * get us both a list of data addresses, and "parity" addresses
533    * (which are really mirror copies).
534    */
535   asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe,
536     buf, RF_DONT_REMAP);
537   aasm = asm_h->stripeMap;
538 
539   buf1 = buf;
540   /*
541    * Loop through the data blocks, setting up read nodes for each.
542    */
543   for(pda=aasm->physInfo,i=0;i<layoutPtr->numDataCol;i++,pda=pda->next)
544   {
545     RF_ASSERT(pda);
546 
547     rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
548 
549     RF_ASSERT(pda->numSector != 0);
550     if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
551       /* cannot verify parity with dead disk */
552       goto done;
553     }
554     pda->bufPtr = buf1;
555     blockNode->succedents[i]->params[0].p = pda;
556     blockNode->succedents[i]->params[1].p = buf1;
557     blockNode->succedents[i]->params[2].v = psID;
558     blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
559     buf1 += nbytes;
560   }
561   RF_ASSERT(pda == NULL);
562   /*
563    * keep i, buf1 running
564    *
565    * Loop through parity blocks, setting up read nodes for each.
566    */
567   for(pda=aasm->parityInfo;i<layoutPtr->numDataCol+layoutPtr->numParityCol;i++,pda=pda->next)
568   {
569     RF_ASSERT(pda);
570     rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
571     RF_ASSERT(pda->numSector != 0);
572     if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
573       /* cannot verify parity with dead disk */
574       goto done;
575     }
576     pda->bufPtr = buf1;
577     blockNode->succedents[i]->params[0].p = pda;
578     blockNode->succedents[i]->params[1].p = buf1;
579     blockNode->succedents[i]->params[2].v = psID;
580     blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
581     buf1 += nbytes;
582   }
583   RF_ASSERT(pda == NULL);
584 
585   bzero((char *)&tracerec, sizeof(tracerec));
586   rd_dag_h->tracerec = &tracerec;
587 
588   if (rf_verifyParityDebug > 1) {
589     printf("[%d] RAID1 parity verify read dag:\n", tid);
590     rf_PrintDAGList(rd_dag_h);
591   }
592 
593   RF_LOCK_MUTEX(mcpair->mutex);
594   mcpair->flag = 0;
595   rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
596 		 (void *)mcpair);
597   while (mcpair->flag == 0) {
598     RF_WAIT_MCPAIR(mcpair);
599   }
600   RF_UNLOCK_MUTEX(mcpair->mutex);
601 
602   if (rd_dag_h->status != rf_enable) {
603     RF_ERRORMSG("Unable to verify raid1 parity: can't read stripe\n");
604     ret = RF_PARITY_COULD_NOT_VERIFY;
605     goto done;
606   }
607 
608   /*
609    * buf1 is the beginning of the data blocks chunk
610    * buf2 is the beginning of the parity blocks chunk
611    */
612   buf1 = buf;
613   buf2 = buf + (nbytes * layoutPtr->numDataCol);
614   ret = RF_PARITY_OKAY;
615   /*
616    * bbufs is "bad bufs"- an array whose entries are the data
617    * column numbers where we had miscompares. (That is, column 0
618    * and column 1 of the array are mirror copies, and are considered
619    * "data column 0" for this purpose).
620    */
621   RF_MallocAndAdd(bbufs, layoutPtr->numParityCol*sizeof(int), (int *),
622     allocList);
623   nbad = 0;
624   /*
625    * Check data vs "parity" (mirror copy).
626    */
627   for(i=0;i<layoutPtr->numDataCol;i++) {
628     if (rf_verifyParityDebug) {
629       printf("[%d] RAID1 parity verify %d bytes: i=%d buf1=%lx buf2=%lx buf=%lx\n",
630         tid, nbytes, i, (long)buf1, (long)buf2, (long)buf);
631     }
632     ret = bcmp(buf1, buf2, nbytes);
633     if (ret) {
634       if (rf_verifyParityDebug > 1) {
635         for(j=0;j<nbytes;j++) {
636          if (buf1[j] != buf2[j])
637            break;
638         }
639         printf("psid=%ld j=%d\n", (long)psID, j);
640         printf("buf1 %02x %02x %02x %02x %02x\n", buf1[0]&0xff,
641           buf1[1]&0xff, buf1[2]&0xff, buf1[3]&0xff, buf1[4]&0xff);
642         printf("buf2 %02x %02x %02x %02x %02x\n", buf2[0]&0xff,
643           buf2[1]&0xff, buf2[2]&0xff, buf2[3]&0xff, buf2[4]&0xff);
644       }
645       if (rf_verifyParityDebug) {
646         printf("[%d] RAID1: found bad parity, i=%d\n", tid, i);
647       }
648       /*
649        * Parity is bad. Keep track of which columns were bad.
650        */
651       if (bbufs)
652         bbufs[nbad] = i;
653       nbad++;
654       ret = RF_PARITY_BAD;
655     }
656     buf1 += nbytes;
657     buf2 += nbytes;
658   }
659 
660   if ((ret != RF_PARITY_OKAY) && correct_it) {
661     ret = RF_PARITY_COULD_NOT_CORRECT;
662     if (rf_verifyParityDebug) {
663       printf("[%d] RAID1 parity verify: parity not correct\n", tid);
664     }
665     if (bbufs == NULL)
666       goto done;
667     /*
668      * Make a DAG with one write node for each bad unit. We'll simply
669      * write the contents of the data unit onto the parity unit for
670      * correction. (It's possible that the mirror copy was the correct
671      * copy, and that we're spooging good data by writing bad over it,
672      * but there's no way we can know that.
673      */
674     wr_dag_h = rf_MakeSimpleDAG(raidPtr, nbad, nbytes, buf,
675       rf_DiskWriteFunc, rf_DiskWriteUndoFunc, "Wnp", allocList, flags,
676       RF_IO_NORMAL_PRIORITY);
677     if (wr_dag_h == NULL)
678       goto done;
679     wrBlock = wr_dag_h->succedents[0];
680     /*
681      * Fill in a write node for each bad compare.
682      */
683     for(i=0;i<nbad;i++) {
684       j = i+layoutPtr->numDataCol;
685       pda = blockNode->succedents[j]->params[0].p;
686       pda->bufPtr = blockNode->succedents[i]->params[1].p;
687       wrBlock->succedents[i]->params[0].p = pda;
688       wrBlock->succedents[i]->params[1].p = pda->bufPtr;
689       wrBlock->succedents[i]->params[2].v = psID;
690       wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
691     }
692     bzero((char *)&tracerec, sizeof(tracerec));
693     wr_dag_h->tracerec = &tracerec;
694     if (rf_verifyParityDebug > 1) {
695       printf("Parity verify write dag:\n");
696       rf_PrintDAGList(wr_dag_h);
697     }
698     RF_LOCK_MUTEX(mcpair->mutex);
699     mcpair->flag = 0;
700     /* fire off the write DAG */
701     rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
702 		   (void *)mcpair);
703     while (!mcpair->flag) {
704       RF_WAIT_COND(mcpair->cond, mcpair->mutex);
705     }
706     RF_UNLOCK_MUTEX(mcpair->mutex);
707     if (wr_dag_h->status != rf_enable) {
708       RF_ERRORMSG("Unable to correct RAID1 parity in VerifyParity\n");
709       goto done;
710     }
711     ret = RF_PARITY_CORRECTED;
712   }
713 
714 done:
715   /*
716    * All done. We might've gotten here without doing part of the function,
717    * so cleanup what we have to and return our running status.
718    */
719   if (asm_h)
720     rf_FreeAccessStripeMap(asm_h);
721   if (rd_dag_h)
722     rf_FreeDAG(rd_dag_h);
723   if (wr_dag_h)
724     rf_FreeDAG(wr_dag_h);
725   if (mcpair)
726     rf_FreeMCPair(mcpair);
727   rf_FreeAllocList(allocList);
728   if (rf_verifyParityDebug) {
729     printf("[%d] RAID1 parity verify, returning %d\n", tid, ret);
730   }
731   return(ret);
732 }
733 
734 int rf_SubmitReconBufferRAID1(rbuf, keep_it, use_committed)
735   RF_ReconBuffer_t  *rbuf;          /* the recon buffer to submit */
736   int                keep_it;       /* whether we can keep this buffer or we have to return it */
737   int                use_committed; /* whether to use a committed or an available recon buffer */
738 {
739   RF_ReconParityStripeStatus_t *pssPtr;
740   RF_ReconCtrl_t *reconCtrlPtr;
741   RF_RaidLayout_t *layoutPtr;
742   int tid=0, retcode, created;
743   RF_CallbackDesc_t *cb, *p;
744   RF_ReconBuffer_t *t;
745   RF_Raid_t *raidPtr;
746   caddr_t ta;
747 
748   retcode = 0;
749   created = 0;
750 
751   raidPtr = rbuf->raidPtr;
752   layoutPtr = &raidPtr->Layout;
753   reconCtrlPtr = raidPtr->reconControl[rbuf->row];
754 
755   RF_ASSERT(rbuf);
756   RF_ASSERT(rbuf->col != reconCtrlPtr->fcol);
757 
758   if (rf_reconbufferDebug) {
759     rf_get_threadid(tid);
760     printf("[%d] RAID1 reconbuffer submission r%d c%d psid %ld ru%d (failed offset %ld)\n",
761       tid, rbuf->row, rbuf->col, (long)rbuf->parityStripeID, rbuf->which_ru,
762       (long)rbuf->failedDiskSectorOffset);
763   }
764 
765   if (rf_reconDebug) {
766     printf("RAID1 reconbuffer submit psid %ld buf %lx\n",
767 	   (long)rbuf->parityStripeID, (long)rbuf->buffer);
768     printf("RAID1 psid %ld   %02x %02x %02x %02x %02x\n",
769 	   (long)rbuf->parityStripeID,
770       rbuf->buffer[0], rbuf->buffer[1], rbuf->buffer[2], rbuf->buffer[3],
771       rbuf->buffer[4]);
772   }
773 
774   RF_LOCK_PSS_MUTEX(raidPtr,rbuf->row,rbuf->parityStripeID);
775 
776   RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
777 
778   pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable,
779     rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created);
780   RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten an rbuf for it */
781 
782   /*
783    * Since this is simple mirroring, the first submission for a stripe is also
784    * treated as the last.
785    */
786 
787   t = NULL;
788   if (keep_it) {
789     if (rf_reconbufferDebug) {
790       printf("[%d] RAID1 rbuf submission: keeping rbuf\n", tid);
791     }
792     t = rbuf;
793   }
794   else {
795     if (use_committed) {
796       if (rf_reconbufferDebug) {
797         printf("[%d] RAID1 rbuf submission: using committed rbuf\n", tid);
798       }
799       t = reconCtrlPtr->committedRbufs;
800       RF_ASSERT(t);
801       reconCtrlPtr->committedRbufs = t->next;
802       t->next = NULL;
803     }
804     else if (reconCtrlPtr->floatingRbufs) {
805       if (rf_reconbufferDebug) {
806         printf("[%d] RAID1 rbuf submission: using floating rbuf\n", tid);
807       }
808       t = reconCtrlPtr->floatingRbufs;
809       reconCtrlPtr->floatingRbufs = t->next;
810       t->next = NULL;
811     }
812   }
813   if (t == NULL) {
814     if (rf_reconbufferDebug) {
815       printf("[%d] RAID1 rbuf submission: waiting for rbuf\n", tid);
816     }
817     RF_ASSERT((keep_it == 0) && (use_committed == 0));
818     raidPtr->procsInBufWait++;
819     if ((raidPtr->procsInBufWait == (raidPtr->numCol-1))
820       && (raidPtr->numFullReconBuffers == 0))
821     {
822       /* ruh-ro */
823       RF_ERRORMSG("Buffer wait deadlock\n");
824       rf_PrintPSStatusTable(raidPtr, rbuf->row);
825       RF_PANIC();
826     }
827     pssPtr->flags |= RF_PSS_BUFFERWAIT;
828     cb = rf_AllocCallbackDesc();
829     cb->row = rbuf->row;
830     cb->col = rbuf->col;
831     cb->callbackArg.v = rbuf->parityStripeID;
832     cb->callbackArg2.v = rbuf->which_ru;
833     cb->next = NULL;
834     if (reconCtrlPtr->bufferWaitList == NULL) {
835       /* we are the wait list- lucky us */
836       reconCtrlPtr->bufferWaitList = cb;
837     }
838     else {
839       /* append to wait list */
840       for(p=reconCtrlPtr->bufferWaitList;p->next;p=p->next);
841       p->next = cb;
842     }
843     retcode = 1;
844     goto out;
845   }
846   if (t != rbuf) {
847     t->row = rbuf->row;
848     t->col = reconCtrlPtr->fcol;
849     t->parityStripeID = rbuf->parityStripeID;
850     t->which_ru = rbuf->which_ru;
851     t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset;
852     t->spRow = rbuf->spRow;
853     t->spCol = rbuf->spCol;
854     t->spOffset = rbuf->spOffset;
855     /* Swap buffers. DANCE! */
856     ta = t->buffer;
857     t->buffer = rbuf->buffer;
858     rbuf->buffer = ta;
859   }
860   /*
861    * Use the rbuf we've been given as the target.
862    */
863   RF_ASSERT(pssPtr->rbuf == NULL);
864   pssPtr->rbuf = t;
865 
866   t->count = 1;
867   /*
868    * Below, we use 1 for numDataCol (which is equal to the count in the
869    * previous line), so we'll always be done.
870    */
871   rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, 1);
872 
873 out:
874   RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID);
875   RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex );
876   if (rf_reconbufferDebug) {
877     printf("[%d] RAID1 rbuf submission: returning %d\n", tid, retcode);
878   }
879   return(retcode);
880 }
881