xref: /netbsd-src/sys/dev/raidframe/rf_decluster.c (revision dc306354b0b29af51801a7632f1e95265a68cd81)
1 /*	$NetBSD: rf_decluster.c,v 1.1 1998/11/13 04:20:28 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Mark Holland
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*----------------------------------------------------------------------
30  *
31  * rf_decluster.c -- code related to the declustered layout
32  *
33  * Created 10-21-92 (MCH)
34  *
35  * Nov 93:  adding support for distributed sparing.  This code is a little
36  *          complex:  the basic layout used is as follows:
37  *          let F = (v-1)/GCD(r,v-1).  The spare space for each set of
38  *          F consecutive fulltables is grouped together and placed after
39  *          that set of tables.
40  *                   +------------------------------+
41  *                   |        F fulltables          |
42  *                   |        Spare Space           |
43  *                   |        F fulltables          |
44  *                   |        Spare Space           |
45  *                   |            ...               |
46  *                   +------------------------------+
47  *
48  *--------------------------------------------------------------------*/
49 
50 /*
51  * :
52  * Log: rf_decluster.c,v
53  * Revision 1.51  1996/08/21 19:47:10  jimz
54  * fix bogus return values from config
55  *
56  * Revision 1.50  1996/08/20  22:41:42  jimz
57  * better diagnostics for bad blockdesigns
58  *
59  * Revision 1.49  1996/07/31  16:56:18  jimz
60  * dataBytesPerStripe, sectorsPerDisk init arch-indep.
61  *
62  * Revision 1.48  1996/07/29  14:05:12  jimz
63  * fix numPUs/numRUs confusion (everything is now numRUs)
64  * clean up some commenting, return values
65  *
66  * Revision 1.47  1996/07/27  23:36:08  jimz
67  * Solaris port of simulator
68  *
69  * Revision 1.46  1996/07/27  18:40:11  jimz
70  * cleanup sweep
71  *
72  * Revision 1.45  1996/07/18  22:57:14  jimz
73  * port simulator to AIX
74  *
75  * Revision 1.44  1996/07/13  00:00:59  jimz
76  * sanitized generalized reconstruction architecture
77  * cleaned up head sep, rbuf problems
78  *
79  * Revision 1.43  1996/06/19  17:53:48  jimz
80  * move GetNumSparePUs, InstallSpareTable ops into layout switch
81  *
82  * Revision 1.42  1996/06/17  03:23:48  jimz
83  * switch DeclusteredDS typing
84  *
85  * Revision 1.41  1996/06/11  08:55:15  jimz
86  * improved error-checking at configuration time
87  *
88  * Revision 1.40  1996/06/10  11:55:47  jimz
89  * Straightened out some per-array/not-per-array distinctions, fixed
90  * a couple bugs related to confusion. Added shutdown lists. Removed
91  * layout shutdown function (now subsumed by shutdown lists).
92  *
93  * Revision 1.39  1996/06/09  02:36:46  jimz
94  * lots of little crufty cleanup- fixup whitespace
95  * issues, comment #ifdefs, improve typing in some
96  * places (esp size-related)
97  *
98  * Revision 1.38  1996/06/07  22:26:27  jimz
99  * type-ify which_ru (RF_ReconUnitNum_t)
100  *
101  * Revision 1.37  1996/06/07  21:33:04  jimz
102  * begin using consistent types for sector numbers,
103  * stripe numbers, row+col numbers, recon unit numbers
104  *
105  * Revision 1.36  1996/06/03  23:28:26  jimz
106  * more bugfixes
107  * check in tree to sync for IPDS runs with current bugfixes
108  * there still may be a problem with threads in the script test
109  * getting I/Os stuck- not trivially reproducible (runs ~50 times
110  * in a row without getting stuck)
111  *
112  * Revision 1.35  1996/06/02  17:31:48  jimz
113  * Moved a lot of global stuff into array structure, where it belongs.
114  * Fixed up paritylogging, pss modules in this manner. Some general
115  * code cleanup. Removed lots of dead code, some dead files.
116  *
117  * Revision 1.34  1996/05/30  23:22:16  jimz
118  * bugfixes of serialization, timing problems
119  * more cleanup
120  *
121  * Revision 1.33  1996/05/30  11:29:41  jimz
122  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
123  * about when stripes should be locked (I made it consistent: no parity, no lock)
124  * There was a lot of extra serialization of I/Os which I've removed- a lot of
125  * it was to calculate values for the cache code, which is no longer with us.
126  * More types, function, macro cleanup. Added code to properly quiesce the array
127  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
128  * before. Fixed memory allocation, freeing bugs.
129  *
130  * Revision 1.32  1996/05/27  18:56:37  jimz
131  * more code cleanup
132  * better typing
133  * compiles in all 3 environments
134  *
135  * Revision 1.31  1996/05/24  01:59:45  jimz
136  * another checkpoint in code cleanup for release
137  * time to sync kernel tree
138  *
139  * Revision 1.30  1996/05/23  00:33:23  jimz
140  * code cleanup: move all debug decls to rf_options.c, all extern
141  * debug decls to rf_options.h, all debug vars preceded by rf_
142  *
143  * Revision 1.29  1996/05/18  19:51:34  jimz
144  * major code cleanup- fix syntax, make some types consistent,
145  * add prototypes, clean out dead code, et cetera
146  *
147  * Revision 1.28  1995/12/12  18:10:06  jimz
148  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
149  * fix 80-column brain damage in comments
150  *
151  * Revision 1.27  1995/12/01  16:00:08  root
152  * added copyright info
153  *
154  * Revision 1.26  1995/11/28  21:35:12  amiri
155  * set the RF_BD_DECLUSTERED flag
156  *
157  * Revision 1.25  1995/11/17  18:56:00  wvcii
158  * added prototyping to MapParity
159  *
160  * Revision 1.24  1995/07/04  22:25:33  holland
161  * increased default num bufs
162  *
163  * Revision 1.23  1995/07/03  20:23:51  holland
164  * changed floating recon bufs & head sep yet again
165  *
166  * Revision 1.22  1995/07/03  18:12:14  holland
167  * changed the way the number of floating recon bufs & the head sep
168  * limit are set
169  *
170  * Revision 1.21  1995/07/02  15:07:42  holland
171  * bug fixes related to getting distributed sparing numbers
172  *
173  * Revision 1.20  1995/06/23  13:41:28  robby
174  * updated to prototypes in rf_layout.h
175  *
176  */
177 
178 #ifdef _KERNEL
179 #define KERNEL
180 #endif
181 
182 
183 #include "rf_types.h"
184 #include "rf_raid.h"
185 #include "rf_raidframe.h"
186 #include "rf_configure.h"
187 #include "rf_decluster.h"
188 #include "rf_debugMem.h"
189 #include "rf_utils.h"
190 #include "rf_alloclist.h"
191 #include "rf_general.h"
192 #include "rf_shutdown.h"
193 #include "rf_sys.h"
194 
195 extern int rf_copyback_in_progress;                /* debug only */
196 
197 /* found in rf_kintf.c */
198 int rf_GetSpareTableFromDaemon(RF_SparetWait_t  *req);
199 
200 /* configuration code */
201 
/*
 * rf_ConfigureDeclustered -- parse the layout-specific part of the config
 * and build the lookup tables used by the declustered mapping routines.
 *
 * The layout-specific buffer (cfgPtr->layoutSpecific) contains, in order:
 *   - the sparemap file name (RF_SPAREMAP_NAME_LEN bytes, only meaningful
 *     when distributed sparing is enabled)
 *   - the block design parameters b, v, k, r, lambda and the no-rotate
 *     flag, one native int each
 *   - the b x k block design (layout) table, one byte per entry
 *
 * On success the layout's lookup tables (LayoutTable, OffsetTable,
 * BlockTable) are built, per-disk capacity values are computed, and 0 is
 * returned.  Returns ENOMEM or EINVAL on failure; all allocations are
 * hung off raidPtr->cleanupList, so nothing needs freeing here on error.
 */
int rf_ConfigureDeclustered(
  RF_ShutdownList_t  **listp,
  RF_Raid_t           *raidPtr,
  RF_Config_t         *cfgPtr)
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    int b, v, k, r, lambda;				/* block design params */
    int i, j;
    RF_RowCol_t *first_avail_slot;
    RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
    RF_DeclusteredConfigInfo_t *info;
    RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
    RF_StripeCount_t totSparePUsPerDisk;
    RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
    RF_SectorCount_t SpareSpaceInSUs;
    char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
    RF_StripeNum_t l, SUID;

    SUID = l = 0;
    numCompleteSpareRegionsPerDisk = 0;

    /* 1. create layout specific structure */
    RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
    if (info == NULL)
      return(ENOMEM);
    layoutPtr->layoutSpecificInfo = (void *) info;
    info->SpareTable = NULL;

    /* 2. extract parameters from the config structure */
    if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
      /* sparemap file name occupies the first RF_SPAREMAP_NAME_LEN bytes */
      (void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
    }
    /* the name field is skipped even when sparing is not distributed */
    cfgBuf += RF_SPAREMAP_NAME_LEN;

    b        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    v        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    k        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    r        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    lambda   = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    raidPtr->noRotate = *( (int *) cfgBuf);   cfgBuf += sizeof(int);

    /* the sparemaps are generated assuming that parity is rotated, so we issue
     * a warning if both distributed sparing and no-rotate are on at the same time
     */
    if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
	RF_ERRORMSG("Warning:  distributed sparing specified without parity rotation.\n");
    }

    /* the block design must mention every column exactly once per tuple */
    if (raidPtr->numCol != v) {
        RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
        return(EINVAL);
    }

    /* 3.  set up the values used in the mapping code */
    info->BlocksPerTable = b;
    info->Lambda = lambda;
    info->NumParityReps = info->groupSize = k;
    info->SUsPerTable = b * (k-1) * layoutPtr->SUsPerPU;/* b blks, k-1 SUs each */
    info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
    info->PUsPerBlock = k-1;
    info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
    info->TableDepthInPUs = (b*k) / v;
    info->FullTableDepthInPUs = info->TableDepthInPUs * k;		/* k repetitions */

    /* used only in distributed sparing case */
    info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1);		/* (v-1)/gcd fulltables */
    info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
    info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU;

    /* check to make sure the block design is sufficiently small */
    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
        if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
	    RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
			 (int)info->FullTableDepthInPUs,
			 (int)info->SpareSpaceDepthPerRegionInSUs,
			 (int)layoutPtr->stripeUnitsPerDisk);
	    return(EINVAL);
	}
    } else {
	if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
	    RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
			 (int)(info->TableDepthInPUs * layoutPtr->SUsPerPU), \
			 (int)layoutPtr->stripeUnitsPerDisk);
	    return(EINVAL);
	}
    }


    /* compute the size of each disk, and the number of tables in the last fulltable (which
     * need not be complete)
     */
    if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {

	PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
	/* a spare region = the data/parity PUs of its tables plus the
	 * spare space for them (1/(v-1) extra per table row) */
	spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
				 (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1));
	info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;

	numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
	info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
	extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

	/* assume conservatively that we need the full amount of spare space in one region in order
	 * to provide spares for the partial spare region at the end of the array.  We set "i" to
	 * the number of tables in the partial spare region.  This may actually include some fulltables.
	 */
	extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
	if (extraPUsPerDisk <= 0) i = 0;
	else i = extraPUsPerDisk/info->TableDepthInPUs;

	complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k);
        info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
	info->ExtraTablesPerDisk = i % k;

	/* note that in the last spare region, the spare space is complete even though data/parity space is not */
	totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
	info->TotSparePUsPerDisk = totSparePUsPerDisk;

	layoutPtr->stripeUnitsPerDisk =
	    ((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs +	 	/* data & parity space */
	     info->ExtraTablesPerDisk * info->TableDepthInPUs +
	     totSparePUsPerDisk								/* spare space */
	    ) * layoutPtr->SUsPerPU;
	layoutPtr->dataStripeUnitsPerDisk =
	    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
	    * layoutPtr->SUsPerPU * (k-1) / k;

    } else {
        /* non-dist spare case:  force each disk to contain an integral number of tables */
        layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
        layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);

	/* compute the number of tables in the last fulltable, which need not be complete */
        complete_FT_count =
            ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;

        info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
        info->ExtraTablesPerDisk =
		((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
    }

    raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

    /* find the disk offset of the stripe unit where the last fulltable starts */
    numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
    diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
        SpareSpaceInSUs  = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
        diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
        info->DiskOffsetOfLastSpareSpaceChunkInSUs =
	    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
    }
    info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
    info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

    /* 4.  create and initialize the lookup tables */
    info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
    if (info->LayoutTable == NULL)
      return(ENOMEM);
    info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
    if (info->OffsetTable == NULL)
      return(ENOMEM);
    info->BlockTable  =	rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
    if (info->BlockTable == NULL)
      return(ENOMEM);

    /* scratch array tracking the next free PU slot on each disk; freed below */
    first_avail_slot = rf_make_1d_array(v, NULL);
    if (first_avail_slot == NULL)
      return(ENOMEM);

    /* the remainder of cfgBuf is the b x k block design table itself */
    for (i=0; i<b; i++)
      for (j=0; j<k; j++)
        info->LayoutTable[i][j] = *cfgBuf++;

    /* initialize offset table */
    for (i=0; i<b; i++) for (j=0; j<k; j++) {
        info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ];
        first_avail_slot[ info->LayoutTable[i][j] ]++;
    }

    /* initialize block table */
    for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) {
        for (i=0; i<b; i++) {
            for (j=0; j<k; j++) {
                info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ]
		                [ info->LayoutTable[i][j] ] = SUID;
            }
            SUID++;
        }
    }

    rf_free_1d_array(first_avail_slot, v);

    /* 5.  set up the remaining redundant-but-useful parameters */

    raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) *
    			  info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
    layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-1);

    /* strange evaluation order below to try and minimize overflow problems */

    layoutPtr->dataSectorsPerStripe = (k-1) * layoutPtr->sectorsPerStripeUnit;
    layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
    layoutPtr->numDataCol = k-1;
    layoutPtr->numParityCol = 1;

    return(0);
}
410 
411 /* declustering with distributed sparing */
412 static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
413 static void rf_ShutdownDeclusteredDS(arg)
414   RF_ThreadArg_t  arg;
415 {
416   RF_DeclusteredConfigInfo_t *info;
417   RF_Raid_t *raidPtr;
418 
419   raidPtr = (RF_Raid_t *)arg;
420   info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
421   if (info->SpareTable)
422     rf_FreeSpareTable(raidPtr);
423 }
424 
425 int rf_ConfigureDeclusteredDS(
426   RF_ShutdownList_t  **listp,
427   RF_Raid_t           *raidPtr,
428   RF_Config_t         *cfgPtr)
429 {
430   int rc;
431 
432   rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
433   if (rc)
434     return(rc);
435   rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
436   if (rc) {
437     RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc);
438     rf_ShutdownDeclusteredDS(raidPtr);
439     return(rc);
440   }
441   return(0);
442 }
443 
/*
 * rf_MapSectorDeclustered -- map a sector address in the RAID address
 * space to the (row, col, sector) of the disk holding the corresponding
 * data unit.  If "remap" is nonzero, the access is retargeted at the
 * distributed spare space instead of the primary location.
 *
 * The mapping peels the stripe unit ID apart layer by layer: fulltable
 * -> table -> block-design block -> offset within the block, then uses
 * the layout/offset tables built at configure time to find the disk and
 * its PU offset.
 */
void rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap)
  RF_Raid_t       *raidPtr;
  RF_RaidAddr_t    raidSector;
  RF_RowCol_t     *row;
  RF_RowCol_t     *col;
  RF_SectorNum_t  *diskSector;
  int              remap;
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
    RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
    RF_StripeNum_t BlockID, BlockOffset, RepIndex;
    RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
    RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;

    /* if the access falls in the (possibly partial) last fulltable, this
     * rewrites SUID/sus_per_fulltable/fulltable_depth/base_suid accordingly */
    rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

    FullTableID     = SUID / sus_per_fulltable;		/* fulltable ID within array (across rows) */
    if (raidPtr->numRow == 1) *row = 0;                 /* avoid a mod and a div in the common case */
    else {
      *row            = FullTableID % raidPtr->numRow;
      FullTableID    /= raidPtr->numRow;			/* convert to fulltable ID on this disk */
    }
    if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
	/* SpareSpace = SUs of spare space preceding this fulltable on disk */
	SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
        SpareSpace  = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
    }
    FullTableOffset = SUID % sus_per_fulltable;
    TableID         = FullTableOffset / info->SUsPerTable;
    TableOffset     = FullTableOffset - TableID * info->SUsPerTable;
    BlockID         = TableOffset / info->PUsPerBlock;
    BlockOffset     = TableOffset - BlockID * info->PUsPerBlock;
    BlockID        %= info->BlocksPerTable;
    /* RepIndex is the rotating parity position; data units at or beyond
     * it are shifted right by one to skip over the parity unit */
    RepIndex        = info->PUsPerBlock - TableID;
    if (!raidPtr->noRotate) BlockOffset    += ((BlockOffset >= RepIndex) ? 1 : 0);
    *col            = info->LayoutTable[BlockID][BlockOffset];

    /* remap to distributed spare space if indicated */
    if (remap) {
      RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
	     (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
      rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
    } else {

        /* accumulate the SU offset on the target disk, outermost
         * structure first */
        outSU	    = base_suid;
        outSU      += FullTableID * fulltable_depth;  				        /* offs to strt of FT */
        outSU	   += SpareSpace;						        /* skip rsvd spare space */
        outSU      += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;   	        /* offs to strt of tble */
        outSU      += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
    }
    outSU          += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	        /* offs to the SU within a PU */

    /* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector.  */
    *diskSector     = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

    RF_ASSERT( *col != -1 );
}
503 
504 
505 /* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
/*
 * rf_MapParityDeclustered -- map a sector address in the RAID address
 * space to the (row, col, sector) of the PARITY unit protecting it.
 *
 * Identical decomposition to rf_MapSectorDeclustered (fulltable ->
 * table -> block), except that the final column/offset lookup uses
 * RepIndex (the rotating parity position) instead of the data unit's
 * BlockOffset.  If "remap" is nonzero the access is retargeted at the
 * distributed spare space.
 */
void rf_MapParityDeclustered(
  RF_Raid_t       *raidPtr,
  RF_RaidAddr_t    raidSector,
  RF_RowCol_t     *row,
  RF_RowCol_t     *col,
  RF_SectorNum_t  *diskSector,
  int              remap)
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
    RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
    RF_StripeNum_t BlockID, BlockOffset, RepIndex;
    RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
    RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;

    /* handle the possibly-partial last fulltable (may rewrite the params) */
    rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

    /* compute row & (possibly) spare space exactly as before */
    FullTableID     = SUID / sus_per_fulltable;
    if (raidPtr->numRow == 1) *row = 0;                         /* avoid a mod and a div in the common case */
    else {
      *row            = FullTableID % raidPtr->numRow;
      FullTableID    /= raidPtr->numRow;			/* convert to fulltable ID on this disk */
    }
    if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
	SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
        SpareSpace  = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
    }

    /* compute BlockID and RepIndex exactly as before */
    FullTableOffset = SUID % sus_per_fulltable;
    TableID         = FullTableOffset / info->SUsPerTable;
    TableOffset     = FullTableOffset - TableID * info->SUsPerTable;
    /*TableOffset     = FullTableOffset % info->SUsPerTable;*/
    /*BlockID         = (TableOffset / info->PUsPerBlock) % info->BlocksPerTable;*/
    BlockID         = TableOffset / info->PUsPerBlock;
    /*BlockOffset     = TableOffset % info->PUsPerBlock;*/
    BlockOffset     = TableOffset - BlockID * info->PUsPerBlock;
    BlockID        %= info->BlocksPerTable;

    /* the parity block is in the position indicated by RepIndex;
     * with rotation disabled it is always the last position */
    RepIndex        = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
    *col	    = info->LayoutTable[BlockID][RepIndex];

    if (remap) {
      RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
	     (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
      rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
    } else {

        /* compute sector as before, except use RepIndex instead of BlockOffset */
        outSU        = base_suid;
        outSU       += FullTableID * fulltable_depth;
        outSU	    += SpareSpace;						/* skip rsvd spare space */
        outSU       += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
        outSU       += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
    }

    outSU       += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
    *diskSector  = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

    RF_ASSERT( *col != -1 );
}
571 
572 /* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
573  * the caller must _never_ attempt to modify this array.
574  */
575 void rf_IdentifyStripeDeclustered(
576   RF_Raid_t        *raidPtr,
577   RF_RaidAddr_t     addr,
578   RF_RowCol_t     **diskids,
579   RF_RowCol_t      *outRow)
580 {
581   RF_RaidLayout_t *layoutPtr           = &(raidPtr->Layout);
582   RF_DeclusteredConfigInfo_t *info     = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
583   RF_StripeCount_t sus_per_fulltable   = info->SUsPerFullTable;
584   RF_StripeCount_t fulltable_depth     = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
585   RF_StripeNum_t  base_suid            = 0;
586   RF_StripeNum_t SUID                  = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
587   RF_StripeNum_t stripeID, FullTableID;
588   int tableOffset;
589 
590   rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
591   FullTableID     = SUID / sus_per_fulltable;		/* fulltable ID within array (across rows) */
592   *outRow         = FullTableID % raidPtr->numRow;
593   stripeID        = rf_StripeUnitIDToStripeID(layoutPtr, SUID);                     /* find stripe offset into array */
594   tableOffset     = (stripeID % info->BlocksPerTable);                        /* find offset into block design table */
595   *diskids        = info->LayoutTable[tableOffset];
596 }
597 
598 /* This returns the default head-separation limit, which is measured
599  * in "required units for reconstruction".  Each time a disk fetches
600  * a unit, it bumps a counter.  The head-sep code prohibits any disk
601  * from getting more than headSepLimit counter values ahead of any
602  * other.
603  *
604  * We assume here that the number of floating recon buffers is already
605  * set.  There are r stripes to be reconstructed in each table, and so
606  * if we have a total of B buffers, we can have at most B/r tables
607  * under recon at any one time.  In each table, lambda units are required
608  * from each disk, so given B buffers, the head sep limit has to be
609  * (lambda*B)/r units.  We subtract one to avoid weird boundary cases.
610  *
611  * for example, suppose were given 50 buffers, r=19, and lambda=4 as in
612  * the 20.5 design.  There are 19 stripes/table to be reconstructed, so
613  * we can have 50/19 tables concurrently under reconstruction, which means
614  * we can allow the fastest disk to get 50/19 tables ahead of the slower
615  * disk.  There are lambda "required units" for each disk, so the fastest
616  * disk can get 4*50/19 = 10 counter values ahead of the slowest.
617  *
618  * If numBufsToAccumulate is not 1, we need to limit the head sep further
619  * because multiple bufs will be required for each stripe under recon.
620  */
621 RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(
622   RF_Raid_t  *raidPtr)
623 {
624   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
625 
626   return(info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate);
627 }
628 
629 /* returns the default number of recon buffers to use.  The value
630  * is somewhat arbitrary...it's intended to be large enough to allow
631  * for a reasonably large head-sep limit, but small enough that you
632  * don't use up all your system memory with buffers.
633  */
634 int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
635 {
636   return(100 * rf_numBufsToAccumulate);
637 }
638 
639 /* sectors in the last fulltable of the array need to be handled
640  * specially since this fulltable can be incomplete.  this function
641  * changes the values of certain params to handle this.
642  *
643  * the idea here is that MapSector et. al. figure out which disk the
644  * addressed unit lives on by computing the modulos of the unit number
645  * with the number of units per fulltable, table, etc.  In the last
646  * fulltable, there are fewer units per fulltable, so we need to adjust
647  * the number of user data units per fulltable to reflect this.
648  *
649  * so, we (1) convert the fulltable size and depth parameters to
650  * the size of the partial fulltable at the end, (2) compute the
651  * disk sector offset where this fulltable starts, and (3) convert
652  * the users stripe unit number from an offset into the array to
653  * an offset into the last fulltable.
654  */
655 void rf_decluster_adjust_params(
656   RF_RaidLayout_t   *layoutPtr,
657   RF_StripeNum_t    *SUID,
658   RF_StripeCount_t  *sus_per_fulltable,
659   RF_StripeCount_t  *fulltable_depth,
660   RF_StripeNum_t    *base_suid)
661 {
662     RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
663 #if defined(__NetBSD__) && defined(_KERNEL)
664     /* Nothing! */
665 #else
666     char pc = layoutPtr->map->parityConfig;
667 #endif
668 
669     if (*SUID >= info->FullTableLimitSUID) {
670 	/* new full table size is size of last full table on disk */
671 	*sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable;
672 
673 	/* new full table depth is corresponding depth */
674 	*fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
675 
676 	/* set up the new base offset */
677 	*base_suid = info->DiskOffsetOfLastFullTableInSUs;
678 
679 	/* convert users array address to an offset into the last fulltable */
680 	*SUID -= info->FullTableLimitSUID;
681     }
682 }
683 
684 /*
685  * map a stripe ID to a parity stripe ID.
686  * See comment above RaidAddressToParityStripeID in layout.c.
687  */
688 void rf_MapSIDToPSIDDeclustered(
689   RF_RaidLayout_t    *layoutPtr,
690   RF_StripeNum_t      stripeID,
691   RF_StripeNum_t     *psID,
692   RF_ReconUnitNum_t  *which_ru)
693 {
694     RF_DeclusteredConfigInfo_t *info;
695 
696     info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
697 
698     *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable))
699         * info->BlocksPerTable + (stripeID % info->BlocksPerTable);
700     *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU))
701         / info->BlocksPerTable;
702     RF_ASSERT( (*which_ru) < layoutPtr->SUsPerPU/layoutPtr->SUsPerRU);
703 }
704 
705 /*
706  * Called from MapSector and MapParity to retarget an access at the spare unit.
707  * Modifies the "col" and "outSU" parameters only.
708  */
void rf_remap_to_spare_space(
  RF_RaidLayout_t             *layoutPtr,
  RF_DeclusteredConfigInfo_t  *info,
  RF_RowCol_t                  row,
  RF_StripeNum_t               FullTableID,
  RF_StripeNum_t               TableID,
  RF_SectorNum_t               BlockID,
  RF_StripeNum_t               base_suid,
  RF_StripeNum_t               SpareRegion,
  RF_RowCol_t                 *outCol,
  RF_StripeNum_t              *outSU)
{
    RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, which_ft;

    /*
     * note that FullTableID and hence SpareRegion may have gotten
     * tweaked by rf_decluster_adjust_params. We detect this by
     * noticing that base_suid is not 0.
     */
    if (base_suid == 0) {
      /* untweaked access: the caller's FullTableID is already correct */
      ftID = FullTableID;
    }
    else {
      /*
       * There may be > 1.0 full tables in the last (i.e. partial)
       * spare region.  find out which of these we're in.
       */
      lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs;
      which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);

      /* compute the actual full table ID */
      ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft;
      /* the tweaked access always lands in the final, partial spare region */
      SpareRegion = info->NumCompleteSRs;
    }
    /* index of this table within its spare region's remapping table */
    TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion;

    /* the spare table gives the disk holding the spare unit for this block */
    *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
    RF_ASSERT( *outCol != -1);

    /* starting SU of the spare space for this region: the partial region's
     * spare space sits after its extra tables at the end of the disk; a
     * complete region's spare space occupies the tail of the region */
    spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
	    info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU :
	    (SpareRegion+1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs;
    *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
    /* sanity check: the remapped unit must still fall on the disk */
    if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
	printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n",(long)*outSU);
    }
}
756 
/*
 * Build a spare-table request describing this array's geometry and
 * install the spare table for failed column fcol, either by reading it
 * directly (user-level) or by asking the user-level daemon (kernel).
 * Returns 0 on success, nonzero on failure.
 */
int rf_InstallSpareTable(
  RF_Raid_t    *raidPtr,
  RF_RowCol_t   frow,
  RF_RowCol_t   fcol)
{
  RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
  RF_SparetWait_t *req;
  int retcode;

  /* package up the layout parameters the table generator needs */
  RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
  req->C                             = raidPtr->numCol;
  req->G                             = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
  req->fcol                          = fcol;
  req->SUsPerPU                      = raidPtr->Layout.SUsPerPU;
  req->TablesPerSpareRegion          = info->TablesPerSpareRegion;
  req->BlocksPerTable                = info->BlocksPerTable;
  req->TableDepthInPUs               = info->TableDepthInPUs;
  req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;

#ifndef KERNEL
  /* user level: read the spare table from the sparemap file directly */
  info->SpareTable = rf_ReadSpareTable(req, info->sparemap_fname);
  RF_Free(req, sizeof(*req));
  retcode = (info->SpareTable) ? 0 : 1;
#else /* !KERNEL */
  /* kernel: hand the request to the user-level daemon.
   * NOTE(review): req is not freed here — presumably ownership passes to
   * the daemon machinery; confirm against rf_GetSpareTableFromDaemon. */
  retcode = rf_GetSpareTableFromDaemon(req);
  RF_ASSERT(!retcode);                                     /* XXX -- fix this to recover gracefully -- XXX */
#endif /* !KERNEL */

  return(retcode);
}
787 
788 #ifdef KERNEL
789 /*
790  * Invoked via ioctl to install a spare table in the kernel.
791  */
792 int rf_SetSpareTable(raidPtr, data)
793   RF_Raid_t  *raidPtr;
794   void       *data;
795 {
796   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
797   RF_SpareTableEntry_t **ptrs;
798   int i, retcode;
799 
800   /* what we need to copyin is a 2-d array, so first copyin the user pointers to the rows in the table */
801   RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
802   retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
803 
804   if (retcode) return(retcode);
805 
806   /* now allocate kernel space for the row pointers */
807   RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
808 
809   /* now allocate kernel space for each row in the table, and copy it in from user space */
810   for (i=0; i<info->TablesPerSpareRegion; i++) {
811     RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
812     retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
813     if (retcode) {
814       info->SpareTable = NULL;             /* blow off the memory we've allocated */
815       return(retcode);
816     }
817   }
818 
819   /* free up the temporary array we used */
820   RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
821 
822   return(0);
823 }
824 #endif /* KERNEL */
825 
826 RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(raidPtr)
827   RF_Raid_t *raidPtr;
828 {
829   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
830 
831   return( ((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk );
832 }
833 
834 
835 void rf_FreeSpareTable(raidPtr)
836   RF_Raid_t  *raidPtr;
837 {
838   long i;
839   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
840   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
841   RF_SpareTableEntry_t **table = info->SpareTable;
842 
843   for (i=0; i<info->TablesPerSpareRegion; i++) {RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));}
844   RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
845   info->SpareTable = (RF_SpareTableEntry_t **) NULL;
846 }
847