xref: /netbsd-src/sys/dev/raidframe/rf_map.c (revision dc306354b0b29af51801a7632f1e95265a68cd81)
1 /*	$NetBSD: rf_map.c,v 1.1 1998/11/13 04:20:31 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Mark Holland
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /**************************************************************************
30  *
31  * map.c -- main code for mapping RAID addresses to physical disk addresses
32  *
33  **************************************************************************/
34 
35 /*
36  * :
37  * Log: rf_map.c,v
38  * Revision 1.53  1996/11/05 21:10:40  jimz
39  * failed pda generalization
40  *
41  * Revision 1.52  1996/08/20  19:58:39  jimz
42  * initialize numParityFailed and numQFailed to 0 in MarkFailuresInASMList
43  *
44  * Revision 1.51  1996/08/19  22:26:31  jimz
45  * add Chang's bugfixes for double-disk failures in MarkFailuresInASMList
46  *
47  * Revision 1.50  1996/08/19  21:38:06  jimz
48  * stripeOffset was uninitialized in CheckStripeForFailures
49  *
50  * Revision 1.49  1996/07/31  15:34:56  jimz
51  * evenodd changes; bugfixes for double-degraded archs, generalize
52  * some formerly PQ-only functions
53  *
54  * Revision 1.48  1996/07/27  23:36:08  jimz
55  * Solaris port of simulator
56  *
57  * Revision 1.47  1996/07/22  19:52:16  jimz
58  * switched node params to RF_DagParam_t, a union of
59  * a 64-bit int and a void *, for better portability
60  * attempted hpux port, but failed partway through for
61  * lack of a single C compiler capable of compiling all
62  * source files
63  *
64  * Revision 1.46  1996/06/10  12:50:57  jimz
65  * Add counters to freelists to track number of allocations, frees,
66  * grows, max size, etc. Adjust a couple sets of PRIME params based
67  * on the results.
68  *
69  * Revision 1.45  1996/06/10  11:55:47  jimz
70  * Straightened out some per-array/not-per-array distinctions, fixed
71  * a couple bugs related to confusion. Added shutdown lists. Removed
72  * layout shutdown function (now subsumed by shutdown lists).
73  *
74  * Revision 1.44  1996/06/09  02:36:46  jimz
75  * lots of little crufty cleanup- fixup whitespace
76  * issues, comment #ifdefs, improve typing in some
77  * places (esp size-related)
78  *
79  * Revision 1.43  1996/06/07  21:33:04  jimz
80  * begin using consistent types for sector numbers,
81  * stripe numbers, row+col numbers, recon unit numbers
82  *
83  * Revision 1.42  1996/06/05  18:06:02  jimz
84  * Major code cleanup. The Great Renaming is now done.
85  * Better modularity. Better typing. Fixed a bunch of
86  * synchronization bugs. Made a lot of global stuff
87  * per-desc or per-array. Removed dead code.
88  *
89  * Revision 1.41  1996/06/03  23:28:26  jimz
90  * more bugfixes
91  * check in tree to sync for IPDS runs with current bugfixes
92  * there still may be a problem with threads in the script test
93  * getting I/Os stuck- not trivially reproducible (runs ~50 times
94  * in a row without getting stuck)
95  *
96  * Revision 1.40  1996/05/31  22:26:54  jimz
97  * fix a lot of mapping problems, memory allocation problems
98  * found some weird lock issues, fixed 'em
99  * more code cleanup
100  *
101  * Revision 1.39  1996/05/30  23:22:16  jimz
102  * bugfixes of serialization, timing problems
103  * more cleanup
104  *
105  * Revision 1.38  1996/05/30  11:29:41  jimz
106  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
107  * about when stripes should be locked (I made it consistent: no parity, no lock)
108  * There was a lot of extra serialization of I/Os which I've removed- a lot of
109  * it was to calculate values for the cache code, which is no longer with us.
110  * More types, function, macro cleanup. Added code to properly quiesce the array
111  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
112  * before. Fixed memory allocation, freeing bugs.
113  *
114  * Revision 1.37  1996/05/27  18:56:37  jimz
115  * more code cleanup
116  * better typing
117  * compiles in all 3 environments
118  *
119  * Revision 1.36  1996/05/23  21:46:35  jimz
120  * checkpoint in code cleanup (release prep)
121  * lots of types, function names have been fixed
122  *
123  * Revision 1.35  1996/05/23  00:33:23  jimz
124  * code cleanup: move all debug decls to rf_options.c, all extern
125  * debug decls to rf_options.h, all debug vars preceded by rf_
126  *
127  * Revision 1.34  1996/05/20  16:14:45  jimz
128  * switch to rf_{mutex,cond}_{init,destroy}
129  *
130  * Revision 1.33  1996/05/18  19:51:34  jimz
131  * major code cleanup- fix syntax, make some types consistent,
132  * add prototypes, clean out dead code, et cetera
133  *
134  * Revision 1.32  1996/05/17  00:51:47  jimz
135  * reformat for readability
136  *
137  * Revision 1.31  1996/05/16  23:06:26  jimz
138  * convert asmhdr to use RF_FREELIST stuff
139  *
140  * Revision 1.30  1996/05/16  19:09:42  jimz
141  * grow init asm freelist to 32
142  *
143  * Revision 1.29  1996/05/16  15:27:55  jimz
144  * prime freelist pumps for asm and pda lists
145  *
146  * Revision 1.28  1996/05/02  14:58:35  jimz
147  * legibility cleanup
148  *
149  * Revision 1.27  1995/12/12  18:10:06  jimz
150  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
151  * fix 80-column brain damage in comments
152  *
153  * Revision 1.26  1995/12/01  19:25:06  root
154  * added copyright info
155  *
156  * Revision 1.25  1995/11/17  19:01:57  wvcii
157  * added call to MapQ in two fault tolerant case
158  *
159  * Revision 1.24  1995/11/17  15:10:53  wvcii
160  * fixed bug in ASMCheckStatus - ASSERT was using disk sector addresses
161  * rather than raidAddress
162  *
163  * Revision 1.23  1995/07/26  03:26:51  robby
164  * map the allocation and freeing routines for some stuff non-static
165  *
166  * Revision 1.22  1995/06/28  09:33:45  holland
167  * bug fixes related to dist sparing and multiple-row arrays
168  *
169  * Revision 1.21  1995/06/28  04:51:08  holland
170  * added some asserts against zero-length accesses
171  *
172  * Revision 1.20  1995/06/23  13:40:06  robby
173  * updeated to prototypes in rf_layout.h
174  *
175  */
176 
177 #include "rf_types.h"
178 #include "rf_threadstuff.h"
179 #include "rf_raid.h"
180 #include "rf_general.h"
181 #include "rf_map.h"
182 #include "rf_freelist.h"
183 #include "rf_shutdown.h"
184 #include "rf_sys.h"
185 
186 static void rf_FreePDAList(RF_PhysDiskAddr_t *start, RF_PhysDiskAddr_t *end, int count);
187 static void rf_FreeASMList(RF_AccessStripeMap_t *start, RF_AccessStripeMap_t *end,
188 	int count);
189 
190 /*****************************************************************************************
191  *
192  * MapAccess -- main 1st order mapping routine.
193  *
194  * Maps an access in the RAID address space to the corresponding set of physical disk
195  * addresses.  The result is returned as a list of AccessStripeMap structures, one per
196  * stripe accessed.  Each ASM structure contains a pointer to a list of PhysDiskAddr
197  * structures, which describe the physical locations touched by the user access.  Note
198  * that this routine returns only static mapping information, i.e. the list of physical
199  * addresses returned does not necessarily identify the set of physical locations that
200  * will actually be read or written.
201  *
202  * The routine also maps the parity.  The physical disk location returned always
203  * indicates the entire parity unit, even when only a subset of it is being accessed.
204  * This is because an access that is not stripe unit aligned but that spans a stripe
205  * unit boundary may require access two distinct portions of the parity unit, and we
206  * can't yet tell which portion(s) we'll actually need.  We leave it up to the algorithm
207  * selection code to decide what subset of the parity unit to access.
208  *
209  * Note that addresses in the RAID address space must always be maintained as
210  * longs, instead of ints.
211  *
212  * This routine returns NULL if numBlocks is 0
213  *
214  ****************************************************************************************/
215 
216 RF_AccessStripeMapHeader_t *rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap)
217   RF_Raid_t         *raidPtr;
218   RF_RaidAddr_t      raidAddress; /* starting address in RAID address space */
219   RF_SectorCount_t   numBlocks;   /* number of blocks in RAID address space to access */
220   caddr_t            buffer;      /* buffer to supply/receive data */
221   int                remap;       /* 1 => remap addresses to spare space */
222 {
223   RF_RaidLayout_t            *layoutPtr       = &(raidPtr->Layout);
224   RF_AccessStripeMapHeader_t *asm_hdr         = NULL;
225   RF_AccessStripeMap_t       *asm_list        = NULL, *asm_p = NULL;
226   int                         faultsTolerated = layoutPtr->map->faultsTolerated;
227   RF_RaidAddr_t               startAddress    = raidAddress;            /* we'll change raidAddress along the way */
228   RF_RaidAddr_t               endAddress      = raidAddress + numBlocks;
229   RF_RaidDisk_t             **disks           = raidPtr->Disks;
230 
231   RF_PhysDiskAddr_t          *pda_p, *pda_q;
232   RF_StripeCount_t            numStripes = 0;
233   RF_RaidAddr_t               stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress;
234   RF_RaidAddr_t               startAddrWithinStripe, lastRaidAddr;
235   RF_StripeCount_t            totStripes;
236   RF_StripeNum_t              stripeID, lastSID, SUID, lastSUID;
237   RF_AccessStripeMap_t  *asmList, *t_asm;
238   RF_PhysDiskAddr_t     *pdaList, *t_pda;
239 
240   /* allocate all the ASMs and PDAs up front */
241   lastRaidAddr = raidAddress + numBlocks - 1 ;
242   stripeID     = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
243   lastSID      = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
244   totStripes   = lastSID - stripeID + 1;
245   SUID         = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
246   lastSUID     = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);
247 
248   asmList = rf_AllocASMList(totStripes);
249   pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes);     /* may also need pda(s) per stripe for parity */
250 
251   if (raidAddress+numBlocks > raidPtr->totalSectors) {
252     RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n",
253 		 (int)raidAddress);
254     return(NULL);
255   }
256 
257   if (rf_mapDebug)
258     rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
259   for (; raidAddress < endAddress; ) {
260     /* make the next stripe structure */
261     RF_ASSERT(asmList);
262     t_asm = asmList;
263     asmList = asmList->next;
264     bzero((char *)t_asm, sizeof(RF_AccessStripeMap_t));
265     if (!asm_p)
266       asm_list = asm_p = t_asm;
267     else {
268       asm_p->next = t_asm;
269       asm_p = asm_p->next;
270     }
271     numStripes++;
272 
273     /* map SUs from current location to the end of the stripe */
274     asm_p->stripeID = /*rf_RaidAddressToStripeID(layoutPtr, raidAddress)*/ stripeID++;
275     stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
276     stripeEndAddress = RF_MIN(endAddress,stripeRealEndAddress );
277     asm_p->raidAddress    = raidAddress;
278     asm_p->endRaidAddress = stripeEndAddress;
279 
280     /* map each stripe unit in the stripe */
281     pda_p = NULL;
282     startAddrWithinStripe = raidAddress;      /* Raid addr of start of portion of access that is within this stripe */
283     for (; raidAddress < stripeEndAddress; ) {
284       RF_ASSERT(pdaList);
285       t_pda = pdaList;
286       pdaList = pdaList->next;
287       bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
288       if (!pda_p)
289 	asm_p->physInfo = pda_p = t_pda;
290       else {
291         pda_p->next = t_pda;
292         pda_p = pda_p->next;
293       }
294 
295       pda_p->type = RF_PDA_TYPE_DATA;
296       (layoutPtr->map->MapSector)(raidPtr, raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
297 
298       /* mark any failures we find.  failedPDA is don't-care if there is more than one failure */
299       pda_p->raidAddress = raidAddress;          /* the RAID address corresponding to this physical disk address */
300       nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress);
301       pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress;
302       RF_ASSERT(pda_p->numSector != 0);
303       rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,0);
304       pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress));
305       asm_p->totalSectorsAccessed += pda_p->numSector;
306       asm_p->numStripeUnitsAccessed++;
307       asm_p->origRow = pda_p->row;               /* redundant but harmless to do this in every loop iteration */
308 
309       raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
310     }
311 
312     /* Map the parity. At this stage, the startSector and numSector fields
313      * for the parity unit are always set to indicate the entire parity unit.
314      * We may modify this after mapping the data portion.
315      */
316     switch (faultsTolerated)
317       {
318       case 0:
319 	break;
320       case 1: /* single fault tolerant */
321 	RF_ASSERT(pdaList);
322 	t_pda = pdaList;
323 	pdaList = pdaList->next;
324 	bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
325 	pda_p = asm_p->parityInfo = t_pda;
326 	pda_p->type = RF_PDA_TYPE_PARITY;
327 	(layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
328 				    &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
329 	pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
330 	/* raidAddr may be needed to find unit to redirect to */
331 	pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
332 	rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1);
333 	rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
334 
335 	break;
336       case 2: /* two fault tolerant */
337 	RF_ASSERT(pdaList && pdaList->next);
338 	t_pda = pdaList;
339 	pdaList = pdaList->next;
340 	bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
341 	pda_p = asm_p->parityInfo = t_pda;
342 	pda_p->type = RF_PDA_TYPE_PARITY;
343 	t_pda = pdaList;
344 	pdaList = pdaList->next;
345 	bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
346 	pda_q = asm_p->qInfo = t_pda;
347 	pda_q->type = RF_PDA_TYPE_Q;
348 	(layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
349 				    &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
350 	(layoutPtr->map->MapQ)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
351 			       &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap);
352 	pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
353 	/* raidAddr may be needed to find unit to redirect to */
354 	pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
355 	pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
356 	/* failure mode stuff */
357 	rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1);
358 	rf_ASMCheckStatus(raidPtr,pda_q,asm_p,disks,1);
359 	rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
360 	rf_ASMParityAdjust(asm_p->qInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
361 	break;
362       }
363   }
364   RF_ASSERT(asmList == NULL && pdaList == NULL);
365   /* make the header structure */
366   asm_hdr = rf_AllocAccessStripeMapHeader();
367   RF_ASSERT(numStripes == totStripes);
368   asm_hdr->numStripes = numStripes;
369   asm_hdr->stripeMap  = asm_list;
370 
371   if (rf_mapDebug)
372     rf_PrintAccessStripeMap(asm_hdr);
373   return(asm_hdr);
374 }
375 
376 /*****************************************************************************************
377  * This routine walks through an ASM list and marks the PDAs that have failed.
378  * It's called only when a disk failure causes an in-flight DAG to fail.
379  * The parity may consist of two components, but we want to use only one failedPDA
380  * pointer.  Thus we set failedPDA to point to the first parity component, and rely
381  * on the rest of the code to do the right thing with this.
382  ****************************************************************************************/
383 
384 void rf_MarkFailuresInASMList(raidPtr, asm_h)
385   RF_Raid_t                   *raidPtr;
386   RF_AccessStripeMapHeader_t  *asm_h;
387 {
388   RF_RaidDisk_t  **disks = raidPtr->Disks;
389   RF_AccessStripeMap_t *asmap;
390   RF_PhysDiskAddr_t *pda;
391 
392   for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
393     asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0;
394     asmap->numFailedPDAs = 0;
395     bzero((char *)asmap->failedPDAs,
396       RF_MAX_FAILED_PDA*sizeof(RF_PhysDiskAddr_t *));
397     for (pda = asmap->physInfo; pda; pda=pda->next) {
398       if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
399 	      printf("DEAD DISK BOGUSLY DETECTED!!\n");
400         asmap->numDataFailed++;
401         asmap->failedPDAs[asmap->numFailedPDAs] = pda;
402         asmap->numFailedPDAs++;
403       }
404     }
405     pda = asmap->parityInfo;
406     if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
407       asmap->numParityFailed++;
408       asmap->failedPDAs[asmap->numFailedPDAs] = pda;
409       asmap->numFailedPDAs++;
410     }
411     pda = asmap->qInfo;
412     if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
413       asmap->numQFailed++;
414       asmap->failedPDAs[asmap->numFailedPDAs] = pda;
415       asmap->numFailedPDAs++;
416     }
417   }
418 }
419 
420 /*****************************************************************************************
421  *
422  * DuplicateASM -- duplicates an ASM and returns the new one
423  *
424  ****************************************************************************************/
425 RF_AccessStripeMap_t *rf_DuplicateASM(asmap)
426   RF_AccessStripeMap_t  *asmap;
427 {
428   RF_AccessStripeMap_t *new_asm;
429   RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;
430 
431   new_pda = NULL;
432   new_asm = rf_AllocAccessStripeMapComponent();
433   bcopy((char *)asmap, (char *)new_asm, sizeof(RF_AccessStripeMap_t));
434   new_asm->numFailedPDAs = 0; /* ??? */
435   new_asm->failedPDAs[0] = NULL;
436   new_asm->physInfo = NULL;
437   new_asm->parityInfo = NULL;
438   new_asm->next = NULL;
439 
440   for (pda = asmap->physInfo; pda; pda=pda->next) {      /* copy the physInfo list */
441     t_pda = rf_AllocPhysDiskAddr();
442     bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t));
443     t_pda->next = NULL;
444     if (!new_asm->physInfo) {new_asm->physInfo = t_pda; new_pda = t_pda;}
445     else {new_pda->next = t_pda; new_pda = new_pda->next;}
446     if (pda == asmap->failedPDAs[0])
447       new_asm->failedPDAs[0] = t_pda;
448   }
449   for (pda = asmap->parityInfo; pda; pda=pda->next) {      /* copy the parityInfo list */
450     t_pda = rf_AllocPhysDiskAddr();
451     bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t));
452     t_pda->next = NULL;
453     if (!new_asm->parityInfo) {new_asm->parityInfo = t_pda; new_pda = t_pda;}
454     else {new_pda->next = t_pda; new_pda = new_pda->next;}
455     if (pda == asmap->failedPDAs[0])
456       new_asm->failedPDAs[0] = t_pda;
457   }
458   return(new_asm);
459 }
460 
461 /*****************************************************************************************
462  *
463  * DuplicatePDA -- duplicates a PDA and returns the new one
464  *
465  ****************************************************************************************/
466 RF_PhysDiskAddr_t *rf_DuplicatePDA(pda)
467   RF_PhysDiskAddr_t  *pda;
468 {
469   RF_PhysDiskAddr_t *new;
470 
471   new = rf_AllocPhysDiskAddr();
472   bcopy((char *)pda, (char *)new, sizeof(RF_PhysDiskAddr_t));
473   return(new);
474 }
475 
/*****************************************************************************************
 *
 * routines to allocate and free list elements.  The single-element allocation routines
 * zero the structure before returning it; the list allocators (rf_AllocPDAList,
 * rf_AllocASMList) hand back elements without zeroing them, and rf_MapAccess zeroes
 * each element as it consumes it.
 *
 * rf_FreePDAList and rf_FreeASMList are static.  They should never be called directly,
 * because rf_FreeAccessStripeMap takes care of freeing the PhysDiskAddr and
 * AccessStripeMap lists.
 *
 ****************************************************************************************/
485 
486 static RF_FreeList_t *rf_asmhdr_freelist;
487 #define RF_MAX_FREE_ASMHDR 128
488 #define RF_ASMHDR_INC       16
489 #define RF_ASMHDR_INITIAL   32
490 
491 static RF_FreeList_t *rf_asm_freelist;
492 #define RF_MAX_FREE_ASM 192
493 #define RF_ASM_INC       24
494 #define RF_ASM_INITIAL   64
495 
496 static RF_FreeList_t *rf_pda_freelist;
497 #define RF_MAX_FREE_PDA 192
498 #define RF_PDA_INC       24
499 #define RF_PDA_INITIAL   64
500 
501 /* called at shutdown time.  So far, all that is necessary is to release all the free lists */
502 static void rf_ShutdownMapModule(void *);
503 static void rf_ShutdownMapModule(ignored)
504   void  *ignored;
505 {
506   RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
507   RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *));
508   RF_FREELIST_DESTROY(rf_asm_freelist,next,(RF_AccessStripeMap_t *));
509 }
510 
511 int rf_ConfigureMapModule(listp)
512   RF_ShutdownList_t  **listp;
513 {
514 	int rc;
515 
516 	RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
517 		RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
518 	if (rf_asmhdr_freelist == NULL) {
519 		return(ENOMEM);
520 	}
521 	RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
522 		RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
523 	if (rf_asm_freelist == NULL) {
524 		RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
525 		return(ENOMEM);
526 	}
527 	RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA,
528 		RF_PDA_INC, sizeof(RF_PhysDiskAddr_t));
529 	if (rf_pda_freelist == NULL) {
530 		RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
531 		RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *));
532 		return(ENOMEM);
533 	}
534 
535 	rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
536 	if (rc) {
537 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
538 			__LINE__, rc);
539 		rf_ShutdownMapModule(NULL);
540 		return(rc);
541 	}
542 
543 	RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL,next,
544 		(RF_AccessStripeMapHeader_t *));
545 	RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL,next,
546 		(RF_AccessStripeMap_t *));
547 	RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL,next,
548 		(RF_PhysDiskAddr_t *));
549 
550 	return(0);
551 }
552 
553 RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader()
554 {
555 	RF_AccessStripeMapHeader_t *p;
556 
557 	RF_FREELIST_GET(rf_asmhdr_freelist,p,next,(RF_AccessStripeMapHeader_t *));
558 	bzero((char *)p, sizeof(RF_AccessStripeMapHeader_t));
559 
560 	return(p);
561 }
562 
563 
564 void rf_FreeAccessStripeMapHeader(p)
565   RF_AccessStripeMapHeader_t  *p;
566 {
567 	RF_FREELIST_FREE(rf_asmhdr_freelist,p,next);
568 }
569 
570 RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr()
571 {
572 	RF_PhysDiskAddr_t *p;
573 
574 	RF_FREELIST_GET(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *));
575 	bzero((char *)p, sizeof(RF_PhysDiskAddr_t));
576 
577 	return(p);
578 }
579 
580 /* allocates a list of PDAs, locking the free list only once
581  * when we have to call calloc, we do it one component at a time to simplify
582  * the process of freeing the list at program shutdown.  This should not be
583  * much of a performance hit, because it should be very infrequently executed.
584  */
585 RF_PhysDiskAddr_t *rf_AllocPDAList(count)
586   int  count;
587 {
588 	RF_PhysDiskAddr_t *p = NULL;
589 
590 	RF_FREELIST_GET_N(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *),count);
591 	return(p);
592 }
593 
594 void rf_FreePhysDiskAddr(p)
595   RF_PhysDiskAddr_t  *p;
596 {
597 	RF_FREELIST_FREE(rf_pda_freelist,p,next);
598 }
599 
600 static void rf_FreePDAList(l_start, l_end, count)
601   RF_PhysDiskAddr_t *l_start, *l_end;   /* pointers to start and end of list */
602   int count;                            /* number of elements in list */
603 {
604 	RF_FREELIST_FREE_N(rf_pda_freelist,l_start,next,(RF_PhysDiskAddr_t *),count);
605 }
606 
607 RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent()
608 {
609 	RF_AccessStripeMap_t *p;
610 
611 	RF_FREELIST_GET(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *));
612 	bzero((char *)p, sizeof(RF_AccessStripeMap_t));
613 
614 	return(p);
615 }
616 
617 /* this is essentially identical to AllocPDAList.  I should combine the two.
618  * when we have to call calloc, we do it one component at a time to simplify
619  * the process of freeing the list at program shutdown.  This should not be
620  * much of a performance hit, because it should be very infrequently executed.
621  */
622 RF_AccessStripeMap_t *rf_AllocASMList(count)
623   int  count;
624 {
625 	RF_AccessStripeMap_t *p = NULL;
626 
627 	RF_FREELIST_GET_N(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *),count);
628 	return(p);
629 }
630 
631 void rf_FreeAccessStripeMapComponent(p)
632   RF_AccessStripeMap_t  *p;
633 {
634 	RF_FREELIST_FREE(rf_asm_freelist,p,next);
635 }
636 
637 static void rf_FreeASMList(l_start, l_end, count)
638   RF_AccessStripeMap_t  *l_start, *l_end;
639   int                    count;
640 {
641 	RF_FREELIST_FREE_N(rf_asm_freelist,l_start,next,(RF_AccessStripeMap_t *),count);
642 }
643 
644 void rf_FreeAccessStripeMap(hdr)
645   RF_AccessStripeMapHeader_t  *hdr;
646 {
647   RF_AccessStripeMap_t *p, *pt = NULL;
648   RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL;
649   int count = 0, t, asm_count = 0;
650 
651   for (p = hdr->stripeMap; p; p=p->next) {
652 
653     /* link the 3 pda lists into the accumulating pda list */
654 
655     if (!pdaList) pdaList = p->qInfo; else pdaEnd->next = p->qInfo;
656     for (trailer=NULL,pdp=p->qInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
657     if (trailer) pdaEnd = trailer;
658 
659     if (!pdaList) pdaList = p->parityInfo; else pdaEnd->next = p->parityInfo;
660     for (trailer=NULL,pdp=p->parityInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
661     if (trailer) pdaEnd = trailer;
662 
663     if (!pdaList) pdaList = p->physInfo; else pdaEnd->next = p->physInfo;
664     for (trailer=NULL,pdp=p->physInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
665     if (trailer) pdaEnd = trailer;
666 
667     pt = p;
668     asm_count++;
669   }
670 
671   /* debug only */
672   for (t=0,pdp=pdaList; pdp; pdp=pdp->next)
673     t++;
674   RF_ASSERT(t == count);
675 
676   if (pdaList)
677     rf_FreePDAList(pdaList, pdaEnd, count);
678   rf_FreeASMList(hdr->stripeMap, pt, asm_count);
679   rf_FreeAccessStripeMapHeader(hdr);
680 }
681 
682 /* We can't use the large write optimization if there are any failures in the stripe.
683  * In the declustered layout, there is no way to immediately determine what disks
684  * constitute a stripe, so we actually have to hunt through the stripe looking for failures.
685  * The reason we map the parity instead of just using asm->parityInfo->col is because
686  * the latter may have been already redirected to a spare drive, which would
687  * mess up the computation of the stripe offset.
688  *
689  * ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
690  */
691 int rf_CheckStripeForFailures(raidPtr, asmap)
692   RF_Raid_t             *raidPtr;
693   RF_AccessStripeMap_t  *asmap;
694 {
695   RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
696   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
697   RF_StripeCount_t stripeOffset;
698   int numFailures;
699   RF_RaidAddr_t sosAddr;
700   RF_SectorNum_t diskOffset, poffset;
701   RF_RowCol_t testrow;
702 
703   /* quick out in the fault-free case.  */
704   RF_LOCK_MUTEX(raidPtr->mutex);
705   numFailures = raidPtr->numFailures;
706   RF_UNLOCK_MUTEX(raidPtr->mutex);
707   if (numFailures == 0) return(0);
708 
709   sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
710   row = asmap->physInfo->row;
711   (layoutPtr->map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &testrow);
712   (layoutPtr->map->MapParity)(raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0);  /* get pcol */
713 
714   /* this need not be true if we've redirected the access to a spare in another row
715   RF_ASSERT(row == testrow);
716   */
717   stripeOffset = 0;
718   for (i=0; i<layoutPtr->numDataCol+layoutPtr->numParityCol; i++) {
719     if (diskids[i] != pcol) {
720       if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) {
721         if (raidPtr->status[testrow] != rf_rs_reconstructing)
722           return(1);
723         RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]);
724         layoutPtr->map->MapSector(raidPtr,
725           sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit,
726           &trow, &tcol, &diskOffset, 0);
727         RF_ASSERT( (trow == testrow) && (tcol == diskids[i]) );
728         if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset))
729           return(1);
730         asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
731         return(0);
732       }
733       stripeOffset++;
734     }
735   }
736   return(0);
737 }
738 
739 /*
740    return the number of failed data units in the stripe.
741 */
742 
743 int rf_NumFailedDataUnitsInStripe(raidPtr, asmap)
744   RF_Raid_t             *raidPtr;
745   RF_AccessStripeMap_t  *asmap;
746 {
747   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
748   RF_RowCol_t trow, tcol, row, i;
749   RF_SectorNum_t diskOffset;
750   RF_RaidAddr_t sosAddr;
751   int numFailures;
752 
753   /* quick out in the fault-free case.  */
754   RF_LOCK_MUTEX(raidPtr->mutex);
755   numFailures = raidPtr->numFailures;
756   RF_UNLOCK_MUTEX(raidPtr->mutex);
757   if (numFailures == 0) return(0);
758   numFailures = 0;
759 
760   sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
761   row = asmap->physInfo->row;
762   for (i=0; i<layoutPtr->numDataCol; i++)
763     {
764       (layoutPtr->map->MapSector)(raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit,
765 				  &trow, &tcol, &diskOffset, 0);
766       if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status))
767 	numFailures++;
768     }
769 
770   return numFailures;
771 }
772 
773 
774 /*****************************************************************************************
775  *
776  * debug routines
777  *
778  ****************************************************************************************/
779 
780 void rf_PrintAccessStripeMap(asm_h)
781   RF_AccessStripeMapHeader_t  *asm_h;
782 {
783   rf_PrintFullAccessStripeMap(asm_h, 0);
784 }
785 
786 void rf_PrintFullAccessStripeMap(asm_h, prbuf)
787   RF_AccessStripeMapHeader_t  *asm_h;
788   int                          prbuf; /* flag to print buffer pointers */
789 {
790   int i;
791   RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
792   RF_PhysDiskAddr_t *p;
793   printf("%d stripes total\n", (int)asm_h->numStripes);
794   for (; asmap; asmap = asmap->next) {
795 	  /* 	  printf("Num failures: %d\n",asmap->numDataFailed); */
796 	  /* printf("Num sectors: %d\n",(int)asmap->totalSectorsAccessed); */
797     printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
798 	   (int) asmap->stripeID,
799 	   (int) asmap->totalSectorsAccessed,
800 	   (int) asmap->numDataFailed,
801 	   (int) asmap->numParityFailed);
802     if (asmap->parityInfo) {
803       printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col,
804 	     (int)asmap->parityInfo->startSector,
805 	     (int)(asmap->parityInfo->startSector +
806 		   asmap->parityInfo->numSector - 1));
807       if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->bufPtr);
808       if (asmap->parityInfo->next) {
809 	printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row,
810 	       asmap->parityInfo->next->col,
811 	       (int) asmap->parityInfo->next->startSector,
812 	       (int)(asmap->parityInfo->next->startSector +
813 		     asmap->parityInfo->next->numSector - 1));
814 	if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->next->bufPtr);
815 	RF_ASSERT(asmap->parityInfo->next->next == NULL);
816       }
817       printf("]\n\t");
818     }
819     for (i=0,p=asmap->physInfo; p; p=p->next,i++) {
820       printf("SU r%d c%d s%d-%d ", p->row, p->col, (int)p->startSector,
821 	     (int)(p->startSector + p->numSector - 1));
822       if (prbuf) printf("b0x%lx ", (unsigned long) p->bufPtr);
823       if (i && !(i&1)) printf("\n\t");
824     }
825     printf("\n");
826     p = asm_h->stripeMap->failedPDAs[0];
827     if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 1) printf("[multiple failures]\n");
828     else if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 0)
829       printf("\t[Failed PDA: r%d c%d s%d-%d]\n",p->row, p->col,
830 	     (int)p->startSector, (int)(p->startSector + p->numSector-1));
831   }
832 }
833 
834 void rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks)
835   RF_Raid_t         *raidPtr;
836   RF_RaidAddr_t      raidAddr;
837   RF_SectorCount_t   numBlocks;
838 {
839   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
840   RF_RaidAddr_t ra, sosAddr  = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
841 
842   printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t");
843   for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) {
844     printf("%d (0x%x), ",(int)ra, (int)ra);
845   }
846   printf("\n");
847   printf("Offset into stripe unit: %d (0x%x)\n",
848 	 (int)(raidAddr % layoutPtr->sectorsPerStripeUnit),
849 	 (int)(raidAddr % layoutPtr->sectorsPerStripeUnit));
850 }
851 
852 /*
853    given a parity descriptor and the starting address within a stripe,
854    range restrict the parity descriptor to touch only the correct stuff.
855 */
856 void rf_ASMParityAdjust(
857   RF_PhysDiskAddr_t     *toAdjust,
858   RF_StripeNum_t         startAddrWithinStripe,
859   RF_SectorNum_t         endAddress,
860   RF_RaidLayout_t       *layoutPtr,
861   RF_AccessStripeMap_t  *asm_p)
862 {
863   RF_PhysDiskAddr_t *new_pda;
864 
865   /* when we're accessing only a portion of one stripe unit, we want the parity descriptor
866    * to identify only the chunk of parity associated with the data.  When the access spans
867    * exactly one stripe unit boundary and is less than a stripe unit in size, it uses two disjoint
868    * regions of the parity unit.  When an access spans more than one stripe unit boundary, it
869    * uses all of the parity unit.
870    *
871    * To better handle the case where stripe units are small, we may eventually want to change
872    * the 2nd case so that if the SU size is below some threshold, we just read/write the whole
873    * thing instead of breaking it up into two accesses.
874    */
875   if (asm_p->numStripeUnitsAccessed == 1)
876     {
877       int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
878       toAdjust->startSector += x;
879       toAdjust->raidAddress += x;
880       toAdjust->numSector = asm_p->physInfo->numSector;
881       RF_ASSERT(toAdjust->numSector != 0);
882     }
883   else
884     if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit)
885       {
886 	int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
887 
888 	/* create a second pda and copy the parity map info into it */
889 	RF_ASSERT(toAdjust->next == NULL);
890 	new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
891 	*new_pda = *toAdjust; /* structure assignment */
892 	new_pda->next = NULL;
893 
894 	/* adjust the start sector & number of blocks for the first parity pda */
895 	toAdjust->startSector += x;
896 	toAdjust->raidAddress += x;
897 	toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe;
898 	RF_ASSERT(toAdjust->numSector != 0);
899 
900 	/* adjust the second pda */
901 	new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress);
902 	/*new_pda->raidAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, toAdjust->raidAddress);*/
903 	RF_ASSERT(new_pda->numSector != 0);
904       }
905 }
906 
907 /*
908    Check if a disk has been spared or failed. If spared,
909    redirect the I/O.
910    If it has been failed, record it in the asm pointer.
911    Fourth arg is whether data or parity.
912 */
913 void rf_ASMCheckStatus(
914   RF_Raid_t              *raidPtr,
915   RF_PhysDiskAddr_t      *pda_p,
916   RF_AccessStripeMap_t   *asm_p,
917   RF_RaidDisk_t         **disks,
918   int                     parity)
919 {
920   RF_DiskStatus_t dstatus;
921   RF_RowCol_t frow, fcol;
922 
923   dstatus = disks[pda_p->row][pda_p->col].status;
924 
925   if (dstatus == rf_ds_spared) {
926     /* if the disk has been spared, redirect access to the spare */
927     frow = pda_p->row; fcol = pda_p->col;
928     pda_p->row = disks[frow][fcol].spareRow;
929     pda_p->col = disks[frow][fcol].spareCol;
930   }
931   else if (dstatus == rf_ds_dist_spared) {
932     /* ditto if disk has been spared to dist spare space */
933     RF_RowCol_t or = pda_p->row, oc=pda_p->col;
934     RF_SectorNum_t oo = pda_p->startSector;
935 
936     if (pda_p -> type == RF_PDA_TYPE_DATA)
937       raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
938     else
939       raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
940 
941     if (rf_mapDebug) {
942       printf("Redirected r %d c %d o %d -> r%d c %d o %d\n",or,oc,(int)oo,
943         pda_p->row,pda_p->col,(int)pda_p->startSector);
944     }
945   } else if (RF_DEAD_DISK(dstatus)) {
946     /* if the disk is inaccessible, mark the failure */
947     if (parity)
948       asm_p->numParityFailed++;
949     else {
950       asm_p->numDataFailed++;
951 #if 0
952       /* XXX Do we really want this spewing out on the console? GO */
953       printf("DATA_FAILED!\n");
954 #endif
955     }
956     asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
957     asm_p->numFailedPDAs++;
958 #if 0
959     switch (asm_p->numParityFailed + asm_p->numDataFailed)
960       {
961       case 1:
962 	asm_p->failedPDAs[0] = pda_p;
963 	break;
964       case 2:
965 	asm_p->failedPDAs[1] = pda_p;
966       default:
967 	break;
968       }
969 #endif
970   }
971   /* the redirected access should never span a stripe unit boundary */
972   RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress) ==
973 	 rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress + pda_p->numSector -1));
974   RF_ASSERT(pda_p->col != -1);
975 }
976