1 /* $NetBSD: rf_decluster.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /*---------------------------------------------------------------------- 30 * 31 * rf_decluster.c -- code related to the declustered layout 32 * 33 * Created 10-21-92 (MCH) 34 * 35 * Nov 93: adding support for distributed sparing. This code is a little 36 * complex: the basic layout used is as follows: 37 * let F = (v-1)/GCD(r,v-1). The spare space for each set of 38 * F consecutive fulltables is grouped together and placed after 39 * that set of tables. 40 * +------------------------------+ 41 * | F fulltables | 42 * | Spare Space | 43 * | F fulltables | 44 * | Spare Space | 45 * | ... 
| 46 * +------------------------------+ 47 * 48 *--------------------------------------------------------------------*/ 49 50 /* 51 * : 52 * Log: rf_decluster.c,v 53 * Revision 1.51 1996/08/21 19:47:10 jimz 54 * fix bogus return values from config 55 * 56 * Revision 1.50 1996/08/20 22:41:42 jimz 57 * better diagnostics for bad blockdesigns 58 * 59 * Revision 1.49 1996/07/31 16:56:18 jimz 60 * dataBytesPerStripe, sectorsPerDisk init arch-indep. 61 * 62 * Revision 1.48 1996/07/29 14:05:12 jimz 63 * fix numPUs/numRUs confusion (everything is now numRUs) 64 * clean up some commenting, return values 65 * 66 * Revision 1.47 1996/07/27 23:36:08 jimz 67 * Solaris port of simulator 68 * 69 * Revision 1.46 1996/07/27 18:40:11 jimz 70 * cleanup sweep 71 * 72 * Revision 1.45 1996/07/18 22:57:14 jimz 73 * port simulator to AIX 74 * 75 * Revision 1.44 1996/07/13 00:00:59 jimz 76 * sanitized generalized reconstruction architecture 77 * cleaned up head sep, rbuf problems 78 * 79 * Revision 1.43 1996/06/19 17:53:48 jimz 80 * move GetNumSparePUs, InstallSpareTable ops into layout switch 81 * 82 * Revision 1.42 1996/06/17 03:23:48 jimz 83 * switch DeclusteredDS typing 84 * 85 * Revision 1.41 1996/06/11 08:55:15 jimz 86 * improved error-checking at configuration time 87 * 88 * Revision 1.40 1996/06/10 11:55:47 jimz 89 * Straightened out some per-array/not-per-array distinctions, fixed 90 * a couple bugs related to confusion. Added shutdown lists. Removed 91 * layout shutdown function (now subsumed by shutdown lists). 
92 * 93 * Revision 1.39 1996/06/09 02:36:46 jimz 94 * lots of little crufty cleanup- fixup whitespace 95 * issues, comment #ifdefs, improve typing in some 96 * places (esp size-related) 97 * 98 * Revision 1.38 1996/06/07 22:26:27 jimz 99 * type-ify which_ru (RF_ReconUnitNum_t) 100 * 101 * Revision 1.37 1996/06/07 21:33:04 jimz 102 * begin using consistent types for sector numbers, 103 * stripe numbers, row+col numbers, recon unit numbers 104 * 105 * Revision 1.36 1996/06/03 23:28:26 jimz 106 * more bugfixes 107 * check in tree to sync for IPDS runs with current bugfixes 108 * there still may be a problem with threads in the script test 109 * getting I/Os stuck- not trivially reproducible (runs ~50 times 110 * in a row without getting stuck) 111 * 112 * Revision 1.35 1996/06/02 17:31:48 jimz 113 * Moved a lot of global stuff into array structure, where it belongs. 114 * Fixed up paritylogging, pss modules in this manner. Some general 115 * code cleanup. Removed lots of dead code, some dead files. 116 * 117 * Revision 1.34 1996/05/30 23:22:16 jimz 118 * bugfixes of serialization, timing problems 119 * more cleanup 120 * 121 * Revision 1.33 1996/05/30 11:29:41 jimz 122 * Numerous bug fixes. Stripe lock release code disagreed with the taking code 123 * about when stripes should be locked (I made it consistent: no parity, no lock) 124 * There was a lot of extra serialization of I/Os which I've removed- a lot of 125 * it was to calculate values for the cache code, which is no longer with us. 126 * More types, function, macro cleanup. Added code to properly quiesce the array 127 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general 128 * before. Fixed memory allocation, freeing bugs. 
 * (older RCS revision history omitted)
 */

#ifdef _KERNEL
#define KERNEL
#endif


#include "rf_types.h"
#include "rf_raid.h"
#include "rf_raidframe.h"
#include "rf_configure.h"
#include "rf_decluster.h"
#include "rf_debugMem.h"
#include "rf_utils.h"
#include "rf_alloclist.h"
#include "rf_general.h"
#include "rf_shutdown.h"
#include "rf_sys.h"

extern int rf_copyback_in_progress;   /* debug only */

/* found in rf_kintf.c */
int rf_GetSpareTableFromDaemon(RF_SparetWait_t *req);

/* configuration code */

/*
 * Configure the declustered layout for one array.
 *
 * Parses the layout-specific config buffer (sparemap filename, then the
 * block-design parameters b, v, k, r, lambda and the no-rotate flag, then
 * the b x k layout table itself), validates the block design against the
 * disk geometry, and fills in the RF_DeclusteredConfigInfo_t plus the
 * derived fields of the layout and raid structures.
 *
 * listp   - shutdown list (unused here; kept for the layout-switch signature)
 * raidPtr - the array being configured
 * cfgPtr  - user-supplied configuration
 *
 * Returns 0 on success, ENOMEM on allocation failure, EINVAL on a bad
 * block design.  Allocations are hung off raidPtr->cleanupList, so nothing
 * needs to be freed explicitly on the error paths.
 */
int rf_ConfigureDeclustered(
	RF_ShutdownList_t  **listp,
	RF_Raid_t           *raidPtr,
	RF_Config_t         *cfgPtr)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int b, v, k, r, lambda;	/* block design params */
	int i, j;
	RF_RowCol_t *first_avail_slot;
	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
	RF_DeclusteredConfigInfo_t *info;
	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
	RF_StripeCount_t totSparePUsPerDisk;
	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
	RF_SectorCount_t SpareSpaceInSUs;
	char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
	RF_StripeNum_t l, SUID;

	SUID = l = 0;
	numCompleteSpareRegionsPerDisk = 0;

	/* 1. create layout specific structure */
	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
	if (info == NULL)
		return(ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;
	info->SpareTable = NULL;

	/* 2. extract parameters from the config structure */
	/* the sparemap filename occupies the first RF_SPAREMAP_NAME_LEN bytes
	 * of the buffer; it is only meaningful with distributed sparing, but
	 * the buffer is advanced past it unconditionally */
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		(void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
	}
	cfgBuf += RF_SPAREMAP_NAME_LEN;

	b = *( (int *) cfgBuf); cfgBuf += sizeof(int);	/* blocks per table */
	v = *( (int *) cfgBuf); cfgBuf += sizeof(int);	/* design table element count (= numCol) */
	k = *( (int *) cfgBuf); cfgBuf += sizeof(int);	/* group (block) size */
	r = *( (int *) cfgBuf); cfgBuf += sizeof(int);	/* design replication count */
	lambda = *( (int *) cfgBuf); cfgBuf += sizeof(int);
	raidPtr->noRotate = *( (int *) cfgBuf); cfgBuf += sizeof(int);

	/* the sparemaps are generated assuming that parity is rotated, so we issue
	 * a warning if both distributed sparing and no-rotate are on at the same time
	 */
	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
		RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
	}

	if (raidPtr->numCol != v) {
		RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
		return(EINVAL);
	}

	/* 3. set up the values used in the mapping code */
	info->BlocksPerTable = b;
	info->Lambda = lambda;
	info->NumParityReps = info->groupSize = k;
	info->SUsPerTable = b * (k-1) * layoutPtr->SUsPerPU;	/* b blks, k-1 SUs each */
	info->SUsPerFullTable = k * info->SUsPerTable;		/* rot k times */
	info->PUsPerBlock = k-1;
	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
	info->TableDepthInPUs = (b*k) / v;
	info->FullTableDepthInPUs = info->TableDepthInPUs * k;	/* k repetitions */

	/* used only in distributed sparing case */
	info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1);	/* (v-1)/gcd fulltables */
	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU;

	/* check to make sure the block design is sufficiently small */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
				(int)info->FullTableDepthInPUs,
				(int)info->SpareSpaceDepthPerRegionInSUs,
				(int)layoutPtr->stripeUnitsPerDisk);
			return(EINVAL);
		}
	} else {
		if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
				(int)(info->TableDepthInPUs * layoutPtr->SUsPerPU),
				(int)layoutPtr->stripeUnitsPerDisk);
			return(EINVAL);
		}
	}


	/* compute the size of each disk, and the number of tables in the last fulltable (which
	 * need not be complete)
	 */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {

		PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
		/* a spare region is the data/parity PUs of its tables plus
		 * 1/(v-1) extra for the spare space itself */
		spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
			(info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1));
		info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;

		numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

		/* assume conservatively that we need the full amount of spare space in one region in order
		 * to provide spares for the partial spare region at the end of the array.  We set "i" to
		 * the number of tables in the partial spare region.  This may actually include some fulltables.
		 */
		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		if (extraPUsPerDisk <= 0) i = 0;
		else i = extraPUsPerDisk/info->TableDepthInPUs;

		complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k);
		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk = i % k;

		/* note that in the last spare region, the spare space is complete even though data/parity space is not */
		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		info->TotSparePUsPerDisk = totSparePUsPerDisk;

		layoutPtr->stripeUnitsPerDisk =
			((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs +	/* data & parity space */
			 info->ExtraTablesPerDisk * info->TableDepthInPUs +
			 totSparePUsPerDisk		/* spare space */
			) * layoutPtr->SUsPerPU;
		layoutPtr->dataStripeUnitsPerDisk =
			(complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
			* layoutPtr->SUsPerPU * (k-1) / k;

	} else {
		/* non-dist spare case: force each disk to contain an integral number of tables */
		layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
		layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);

		/* compute the number of tables in the last fulltable, which need not be complete */
		complete_FT_count =
			((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;

		info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk =
			((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
	}

	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;

	/* find the disk offset of the stripe unit where the last fulltable starts */
	numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
			diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
	}
	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

	/* 4. create and initialize the lookup tables */
	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->LayoutTable == NULL)
		return(ENOMEM);
	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->OffsetTable == NULL)
		return(ENOMEM);
	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
	if (info->BlockTable == NULL)
		return(ENOMEM);

	first_avail_slot = rf_make_1d_array(v, NULL);
	if (first_avail_slot == NULL)
		return(ENOMEM);

	/* the remainder of the config buffer is the b x k layout table:
	 * LayoutTable[i][j] is the column holding unit j of block i */
	for (i=0; i<b; i++)
		for (j=0; j<k; j++)
			info->LayoutTable[i][j] = *cfgBuf++;

	/* initialize offset table */
	/* OffsetTable[i][j] counts, per column, how many units of earlier
	 * blocks already landed on that column, i.e. the PU offset of unit
	 * (i,j) within its column's portion of the table */
	for (i=0; i<b; i++) for (j=0; j<k; j++) {
		info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ];
		first_avail_slot[ info->LayoutTable[i][j] ]++;
	}

	/* initialize block table */
	/* BlockTable inverts the mapping: indexed by (SU offset on disk,
	 * column), it yields the stripe unit ID stored there */
	for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) {
		for (i=0; i<b; i++) {
			for (j=0; j<k; j++) {
				info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ]
					[ info->LayoutTable[i][j] ] = SUID;
			}
			SUID++;
		}
	}

	rf_free_1d_array(first_avail_slot, v);

	/* 5. set up the remaining redundant-but-useful parameters */

	raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) *
		info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-1);

	/* strange evaluation order below to try and minimize overflow problems */

	layoutPtr->dataSectorsPerStripe = (k-1) * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
	layoutPtr->numDataCol = k-1;
	layoutPtr->numParityCol = 1;

	return(0);
}

/* declustering with distributed sparing */
static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
/*
 * Shutdown hook for the distributed-sparing variant: releases the spare
 * table, if one was ever installed.  arg is the RF_Raid_t.
 */
static void rf_ShutdownDeclusteredDS(arg)
	RF_ThreadArg_t arg;
{
	RF_DeclusteredConfigInfo_t *info;
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *)arg;
	info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	if (info->SpareTable)
		rf_FreeSpareTable(raidPtr);
}

/*
 * Configure declustering with distributed sparing: identical to the plain
 * declustered configuration plus registration of the shutdown hook above.
 * Returns 0 on success or the error code from configuration/registration.
 */
int rf_ConfigureDeclusteredDS(
	RF_ShutdownList_t  **listp,
	RF_Raid_t           *raidPtr,
	RF_Config_t         *cfgPtr)
{
	int rc;

	rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
	if (rc)
		return(rc);
	rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
	if (rc) {
		RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc);
		/* registration failed, so run the cleanup by hand */
		rf_ShutdownDeclusteredDS(raidPtr);
		return(rc);
	}
	return(0);
}

/*
 * Map a data sector: translate an array-relative sector address into
 * (row, col, diskSector).  If "remap" is nonzero the access is retargeted
 * at the distributed spare unit for that stripe unit instead of its
 * normal location.
 */
void rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap)
	RF_Raid_t *raidPtr;
	RF_RaidAddr_t raidSector;
	RF_RowCol_t *row;
	RF_RowCol_t *col;
	RF_SectorNum_t *diskSector;
	int remap;
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;

	/* adjusts the working parameters if SUID falls in the (possibly
	 * incomplete) last fulltable; base_suid becomes nonzero in that case */
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	FullTableID = SUID / sus_per_fulltable;	/* fulltable ID within array (across rows) */
	if (raidPtr->numRow == 1) *row = 0;	/* avoid a mod and a div in the common case */
	else {
		*row = FullTableID % raidPtr->numRow;
		FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on this disk */
	}
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* successively decompose the offset: fulltable -> table -> block */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;
	RepIndex = info->PUsPerBlock - TableID;
	/* with rotation, skip over the parity unit's slot in the block */
	if (!raidPtr->noRotate) BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
	*col = info->LayoutTable[BlockID][BlockOffset];

	/* remap to distributed spare space if indicated */
	if (remap) {
		RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
			(rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
	} else {

		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;				/* offs to strt of FT */
		outSU += SpareSpace;						/* skip rsvd spare space */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;	/* offs to strt of tble */
		outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
	}
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	/* offs to the SU within a PU */

	/* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector. */
	*diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT( *col != -1 );
}


/*
 * Map the parity unit protecting the given sector's stripe.  Mirrors
 * rf_MapSectorDeclustered, but selects the parity position (RepIndex)
 * in the block rather than the data position.
 *
 * prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail
 */
void rf_MapParityDeclustered(
	RF_Raid_t     *raidPtr,
	RF_RaidAddr_t  raidSector,
	RF_RowCol_t   *row,
	RF_RowCol_t   *col,
	RF_SectorNum_t *diskSector,
	int            remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;

	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);

	/* compute row & (possibly) spare space exactly as before */
	FullTableID = SUID / sus_per_fulltable;
	if (raidPtr->numRow == 1) *row = 0;	/* avoid a mod and a div in the common case */
	else {
		*row = FullTableID % raidPtr->numRow;
		FullTableID /= raidPtr->numRow;	/* convert to fulltable ID on this disk */
	}
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}

	/* compute BlockID and RepIndex exactly as before */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	/*TableOffset = FullTableOffset % info->SUsPerTable;*/
	/*BlockID = (TableOffset / info->PUsPerBlock) % info->BlocksPerTable;*/
	BlockID = TableOffset / info->PUsPerBlock;
	/*BlockOffset = TableOffset % info->PUsPerBlock;*/
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;

	/* the parity block is in the position indicated by RepIndex */
	RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
	*col = info->LayoutTable[BlockID][RepIndex];

	if (remap) {
		RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
			(rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
	} else {

		/* compute sector as before, except use RepIndex instead of BlockOffset */
		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;
		outSU += SpareSpace;		/* skip rsvd spare space */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
		outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
	}

	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	*diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT( *col != -1 );
}

/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
 * the caller must _never_ attempt to modify this array.
 */
void rf_IdentifyStripeDeclustered(
	RF_Raid_t     *raidPtr,
	RF_RaidAddr_t  addr,
	RF_RowCol_t  **diskids,
	RF_RowCol_t   *outRow)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0;
	RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
	RF_StripeNum_t stripeID, FullTableID;
	int tableOffset;

	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
	FullTableID = SUID / sus_per_fulltable;		/* fulltable ID within array (across rows) */
	*outRow = FullTableID % raidPtr->numRow;
	stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID);	/* find stripe offset into array */
	tableOffset = (stripeID % info->BlocksPerTable);	/* find offset into block design table */
	/* hand back a row of the layout table itself -- hence read-only */
	*diskids = info->LayoutTable[tableOffset];
}

/* This returns the default head-separation limit, which is measured
 * in "required units for reconstruction".  Each time a disk fetches
 * a unit, it bumps a counter.  The head-sep code prohibits any disk
 * from getting more than headSepLimit counter values ahead of any
 * other.
 *
 * We assume here that the number of floating recon buffers is already
 * set.  There are r stripes to be reconstructed in each table, and so
 * if we have a total of B buffers, we can have at most B/r tables
 * under recon at any one time.  In each table, lambda units are required
 * from each disk, so given B buffers, the head sep limit has to be
 * (lambda*B)/r units.  We subtract one to avoid weird boundary cases.
 *
 * for example, suppose were given 50 buffers, r=19, and lambda=4 as in
 * the 20.5 design.  There are 19 stripes/table to be reconstructed, so
 * we can have 50/19 tables concurrently under reconstruction, which means
 * we can allow the fastest disk to get 50/19 tables ahead of the slower
 * disk.  There are lambda "required units" for each disk, so the fastest
 * disk can get 4*50/19 = 10 counter values ahead of the slowest.
 *
 * If numBufsToAccumulate is not 1, we need to limit the head sep further
 * because multiple bufs will be required for each stripe under recon.
 */
RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(
	RF_Raid_t *raidPtr)
{
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;

	/* NOTE(review): the comment above describes dividing by r, but the
	 * code divides by TableDepthInPUs -- presumably equivalent for the
	 * designs in use; confirm against the sparemap generator */
	return(info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate);
}

/* returns the default number of recon buffers to use.  The value
 * is somewhat arbitrary...it's intended to be large enough to allow
 * for a reasonably large head-sep limit, but small enough that you
 * don't use up all your system memory with buffers.
 */
int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
{
	return(100 * rf_numBufsToAccumulate);
}

/* sectors in the last fulltable of the array need to be handled
 * specially since this fulltable can be incomplete.  this function
 * changes the values of certain params to handle this.
 *
 * the idea here is that MapSector et. al. figure out which disk the
 * addressed unit lives on by computing the modulos of the unit number
 * with the number of units per fulltable, table, etc.  In the last
 * fulltable, there are fewer units per fulltable, so we need to adjust
 * the number of user data units per fulltable to reflect this.
 *
 * so, we (1) convert the fulltable size and depth parameters to
 * the size of the partial fulltable at the end, (2) compute the
 * disk sector offset where this fulltable starts, and (3) convert
 * the users stripe unit number from an offset into the array to
 * an offset into the last fulltable.
 */
void rf_decluster_adjust_params(
	RF_RaidLayout_t  *layoutPtr,
	RF_StripeNum_t   *SUID,
	RF_StripeCount_t *sus_per_fulltable,
	RF_StripeCount_t *fulltable_depth,
	RF_StripeNum_t   *base_suid)
{
	RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
#if defined(__NetBSD__) && defined(_KERNEL)
	/* Nothing! */
#else
	/* NOTE(review): pc appears unused in this function */
	char pc = layoutPtr->map->parityConfig;
#endif

	if (*SUID >= info->FullTableLimitSUID) {
		/* new full table size is size of last full table on disk */
		*sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable;

		/* new full table depth is corresponding depth */
		*fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;

		/* set up the new base offset */
		*base_suid = info->DiskOffsetOfLastFullTableInSUs;

		/* convert users array address to an offset into the last fulltable */
		*SUID -= info->FullTableLimitSUID;
	}
}

/*
 * map a stripe ID to a parity stripe ID.
 * See comment above RaidAddressToParityStripeID in layout.c.
 */
void rf_MapSIDToPSIDDeclustered(
	RF_RaidLayout_t   *layoutPtr,
	RF_StripeNum_t     stripeID,
	RF_StripeNum_t    *psID,
	RF_ReconUnitNum_t *which_ru)
{
	RF_DeclusteredConfigInfo_t *info;

	info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;

	*psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable))
		* info->BlocksPerTable + (stripeID % info->BlocksPerTable);
	*which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU))
		/ info->BlocksPerTable;
	RF_ASSERT( (*which_ru) < layoutPtr->SUsPerPU/layoutPtr->SUsPerRU);
}

/*
 * Called from MapSector and MapParity to retarget an access at the spare unit.
 * Modifies the "col" and "outSU" parameters only.
708 */ 709 void rf_remap_to_spare_space( 710 RF_RaidLayout_t *layoutPtr, 711 RF_DeclusteredConfigInfo_t *info, 712 RF_RowCol_t row, 713 RF_StripeNum_t FullTableID, 714 RF_StripeNum_t TableID, 715 RF_SectorNum_t BlockID, 716 RF_StripeNum_t base_suid, 717 RF_StripeNum_t SpareRegion, 718 RF_RowCol_t *outCol, 719 RF_StripeNum_t *outSU) 720 { 721 RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, which_ft; 722 723 /* 724 * note that FullTableID and hence SpareRegion may have gotten 725 * tweaked by rf_decluster_adjust_params. We detect this by 726 * noticing that base_suid is not 0. 727 */ 728 if (base_suid == 0) { 729 ftID = FullTableID; 730 } 731 else { 732 /* 733 * There may be > 1.0 full tables in the last (i.e. partial) 734 * spare region. find out which of these we're in. 735 */ 736 lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs; 737 which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU); 738 739 /* compute the actual full table ID */ 740 ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft; 741 SpareRegion = info->NumCompleteSRs; 742 } 743 TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion; 744 745 *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk; 746 RF_ASSERT( *outCol != -1); 747 748 spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ? 
749 info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU : 750 (SpareRegion+1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs; 751 *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs; 752 if (*outSU >= layoutPtr->stripeUnitsPerDisk) { 753 printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n",(long)*outSU); 754 } 755 } 756 757 int rf_InstallSpareTable( 758 RF_Raid_t *raidPtr, 759 RF_RowCol_t frow, 760 RF_RowCol_t fcol) 761 { 762 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 763 RF_SparetWait_t *req; 764 int retcode; 765 766 RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *)); 767 req->C = raidPtr->numCol; 768 req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; 769 req->fcol = fcol; 770 req->SUsPerPU = raidPtr->Layout.SUsPerPU; 771 req->TablesPerSpareRegion = info->TablesPerSpareRegion; 772 req->BlocksPerTable = info->BlocksPerTable; 773 req->TableDepthInPUs = info->TableDepthInPUs; 774 req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs; 775 776 #ifndef KERNEL 777 info->SpareTable = rf_ReadSpareTable(req, info->sparemap_fname); 778 RF_Free(req, sizeof(*req)); 779 retcode = (info->SpareTable) ? 0 : 1; 780 #else /* !KERNEL */ 781 retcode = rf_GetSpareTableFromDaemon(req); 782 RF_ASSERT(!retcode); /* XXX -- fix this to recover gracefully -- XXX */ 783 #endif /* !KERNEL */ 784 785 return(retcode); 786 } 787 788 #ifdef KERNEL 789 /* 790 * Invoked via ioctl to install a spare table in the kernel. 
791 */ 792 int rf_SetSpareTable(raidPtr, data) 793 RF_Raid_t *raidPtr; 794 void *data; 795 { 796 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 797 RF_SpareTableEntry_t **ptrs; 798 int i, retcode; 799 800 /* what we need to copyin is a 2-d array, so first copyin the user pointers to the rows in the table */ 801 RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); 802 retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 803 804 if (retcode) return(retcode); 805 806 /* now allocate kernel space for the row pointers */ 807 RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); 808 809 /* now allocate kernel space for each row in the table, and copy it in from user space */ 810 for (i=0; i<info->TablesPerSpareRegion; i++) { 811 RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *)); 812 retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); 813 if (retcode) { 814 info->SpareTable = NULL; /* blow off the memory we've allocated */ 815 return(retcode); 816 } 817 } 818 819 /* free up the temporary array we used */ 820 RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 821 822 return(0); 823 } 824 #endif /* KERNEL */ 825 826 RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(raidPtr) 827 RF_Raid_t *raidPtr; 828 { 829 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 830 831 return( ((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk ); 832 } 833 834 835 void rf_FreeSpareTable(raidPtr) 836 RF_Raid_t *raidPtr; 837 { 838 long i; 839 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 840 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 841 RF_SpareTableEntry_t 
**table = info->SpareTable; 842 843 for (i=0; i<info->TablesPerSpareRegion; i++) {RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));} 844 RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 845 info->SpareTable = (RF_SpareTableEntry_t **) NULL; 846 } 847