1 /* $NetBSD: rf_decluster.c,v 1.16 2004/02/29 04:03:50 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /*---------------------------------------------------------------------- 30 * 31 * rf_decluster.c -- code related to the declustered layout 32 * 33 * Created 10-21-92 (MCH) 34 * 35 * Nov 93: adding support for distributed sparing. This code is a little 36 * complex: the basic layout used is as follows: 37 * let F = (v-1)/GCD(r,v-1). The spare space for each set of 38 * F consecutive fulltables is grouped together and placed after 39 * that set of tables. 40 * +------------------------------+ 41 * | F fulltables | 42 * | Spare Space | 43 * | F fulltables | 44 * | Spare Space | 45 * | ... | 46 * +------------------------------+ 47 * 48 *--------------------------------------------------------------------*/ 49 50 #include <sys/cdefs.h> 51 __KERNEL_RCSID(0, "$NetBSD: rf_decluster.c,v 1.16 2004/02/29 04:03:50 oster Exp $"); 52 53 #include <dev/raidframe/raidframevar.h> 54 55 #include "rf_archs.h" 56 #include "rf_raid.h" 57 #include "rf_decluster.h" 58 #include "rf_debugMem.h" 59 #include "rf_utils.h" 60 #include "rf_alloclist.h" 61 #include "rf_general.h" 62 #include "rf_kintf.h" 63 #include "rf_shutdown.h" 64 65 #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) 66 67 /* configuration code */ 68 69 int 70 rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, 71 RF_Config_t *cfgPtr) 72 { 73 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 74 int b, v, k, r, lambda; /* block design params */ 75 int i, j; 76 RF_RowCol_t *first_avail_slot; 77 RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk; 78 RF_DeclusteredConfigInfo_t *info; 79 RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, 80 extraPUsPerDisk; 81 RF_StripeCount_t totSparePUsPerDisk; 82 RF_SectorNum_t diskOffsetOfLastFullTableInSUs; 83 RF_SectorCount_t SpareSpaceInSUs; 84 char *cfgBuf = (char *) (cfgPtr->layoutSpecific); 85 RF_StripeNum_t l, SUID; 86 87 SUID = l = 0; 88 numCompleteSpareRegionsPerDisk = 0; 89 90 /* 1. create layout specific structure */ 91 RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList); 92 if (info == NULL) 93 return (ENOMEM); 94 layoutPtr->layoutSpecificInfo = (void *) info; 95 info->SpareTable = NULL; 96 97 /* 2. extract parameters from the config structure */ 98 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { 99 (void)memcpy(info->sparemap_fname, cfgBuf, RF_SPAREMAP_NAME_LEN); 100 } 101 cfgBuf += RF_SPAREMAP_NAME_LEN; 102 103 b = *((int *) cfgBuf); 104 cfgBuf += sizeof(int); 105 v = *((int *) cfgBuf); 106 cfgBuf += sizeof(int); 107 k = *((int *) cfgBuf); 108 cfgBuf += sizeof(int); 109 r = *((int *) cfgBuf); 110 cfgBuf += sizeof(int); 111 lambda = *((int *) cfgBuf); 112 cfgBuf += sizeof(int); 113 raidPtr->noRotate = *((int *) cfgBuf); 114 cfgBuf += sizeof(int); 115 116 /* the sparemaps are generated assuming that parity is rotated, so we 117 * issue a warning if both distributed sparing and no-rotate are on at 118 * the same time */ 119 if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) { 120 RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n"); 121 } 122 if (raidPtr->numCol != v) { 123 RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol); 124 return (EINVAL); 125 } 126 /* 3. set up the values used in the mapping code */ 127 info->BlocksPerTable = b; 128 info->Lambda = lambda; 129 info->NumParityReps = info->groupSize = k; 130 info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU; /* b blks, k-1 SUs each */ 131 info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */ 132 info->PUsPerBlock = k - 1; 133 info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU; 134 info->TableDepthInPUs = (b * k) / v; 135 info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */ 136 137 /* used only in distributed sparing case */ 138 info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1); /* (v-1)/gcd fulltables */ 139 info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion; 140 info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU; 141 142 /* check to make sure the block design is sufficiently small */ 143 if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { 144 if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) { 145 RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n", 146 (int) info->FullTableDepthInPUs, 147 (int) info->SpareSpaceDepthPerRegionInSUs, 148 (int) layoutPtr->stripeUnitsPerDisk); 149 return (EINVAL); 150 } 151 } else { 152 if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) { 153 RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n", 154 (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \ 155 (int) layoutPtr->stripeUnitsPerDisk); 156 return (EINVAL); 157 } 158 } 159 160 161 /* compute the size of each disk, and the number of tables in the last 162 * fulltable (which need not be complete) */ 163 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 164 165 PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU; 166 spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs + 167 (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1)); 168 info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU; 169 170 numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs; 171 info->NumCompleteSRs = numCompleteSpareRegionsPerDisk; 172 extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs; 173 174 /* assume conservatively that we need the full amount of spare 175 * space in one region in order to provide spares for the 176 * partial spare region at the end of the array. We set "i" 177 * to the number of tables in the partial spare region. This 178 * may actually include some fulltables. */ 179 extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); 180 if (extraPUsPerDisk <= 0) 181 i = 0; 182 else 183 i = extraPUsPerDisk / info->TableDepthInPUs; 184 185 complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k); 186 info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; 187 info->ExtraTablesPerDisk = i % k; 188 189 /* note that in the last spare region, the spare space is 190 * complete even though data/parity space is not */ 191 totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); 192 info->TotSparePUsPerDisk = totSparePUsPerDisk; 193 194 layoutPtr->stripeUnitsPerDisk = 195 ((complete_FT_count / raidPtr->numRow) * info->FullTableDepthInPUs + /* data & parity space */ 196 info->ExtraTablesPerDisk * info->TableDepthInPUs + 197 totSparePUsPerDisk /* spare space */ 198 ) * layoutPtr->SUsPerPU; 199 layoutPtr->dataStripeUnitsPerDisk = 200 (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs) 201 * layoutPtr->SUsPerPU * (k - 1) / k; 202 203 } else { 204 /* non-dist spare case: force each disk to contain an 205 * integral number of tables */ 206 layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU); 207 layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU); 208 209 /* compute the number of tables in the last fulltable, which 210 * need not be complete */ 211 complete_FT_count = 212 ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow; 213 214 info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; 215 info->ExtraTablesPerDisk = 216 ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k; 217 } 218 219 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; 220 221 /* find the disk offset of the stripe unit where the last fulltable 222 * starts */ 223 numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow; 224 diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 225 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 226 SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs; 227 diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs; 228 info->DiskOffsetOfLastSpareSpaceChunkInSUs = 229 diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; 230 } 231 info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs; 232 info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk; 233 234 /* 4. create and initialize the lookup tables */ 235 info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList); 236 if (info->LayoutTable == NULL) 237 return (ENOMEM); 238 info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList); 239 if (info->OffsetTable == NULL) 240 return (ENOMEM); 241 info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList); 242 if (info->BlockTable == NULL) 243 return (ENOMEM); 244 245 first_avail_slot = rf_make_1d_array(v, NULL); 246 if (first_avail_slot == NULL) 247 return (ENOMEM); 248 249 for (i = 0; i < b; i++) 250 for (j = 0; j < k; j++) 251 info->LayoutTable[i][j] = *cfgBuf++; 252 253 /* initialize offset table */ 254 for (i = 0; i < b; i++) 255 for (j = 0; j < k; j++) { 256 info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]]; 257 first_avail_slot[info->LayoutTable[i][j]]++; 258 } 259 260 /* initialize block table */ 261 for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) { 262 for (i = 0; i < b; i++) { 263 for (j = 0; j < k; j++) { 264 info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l] 265 [info->LayoutTable[i][j]] = SUID; 266 } 267 SUID++; 268 } 269 } 270 271 rf_free_1d_array(first_avail_slot, v); 272 273 /* 5. set up the remaining redundant-but-useful parameters */ 274 275 raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow * info->ExtraTablesPerDisk) * 276 info->SUsPerTable * layoutPtr->sectorsPerStripeUnit; 277 layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1); 278 279 /* strange evaluation order below to try and minimize overflow 280 * problems */ 281 282 layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit; 283 layoutPtr->numDataCol = k - 1; 284 layoutPtr->numParityCol = 1; 285 286 return (0); 287 } 288 /* declustering with distributed sparing */ 289 static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t); 290 static void 291 rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg) 292 { 293 RF_DeclusteredConfigInfo_t *info; 294 RF_Raid_t *raidPtr; 295 296 raidPtr = (RF_Raid_t *) arg; 297 info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 298 if (info->SpareTable) 299 rf_FreeSpareTable(raidPtr); 300 } 301 302 int 303 rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, 304 RF_Config_t *cfgPtr) 305 { 306 int rc; 307 308 rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr); 309 if (rc) 310 return (rc); 311 rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr); 312 313 return (0); 314 } 315 316 void 317 rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, 318 RF_RowCol_t *row, RF_RowCol_t *col, 319 RF_SectorNum_t *diskSector, int remap) 320 { 321 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 322 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 323 RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; 324 RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; 325 RF_StripeNum_t BlockID, BlockOffset, RepIndex; 326 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; 327 RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 328 RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0; 329 330 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); 331 332 FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array 333 * (across rows) */ 334 if (raidPtr->numRow == 1) 335 *row = 0; /* avoid a mod and a div in the common case */ 336 else { 337 *row = FullTableID % raidPtr->numRow; 338 FullTableID /= raidPtr->numRow; /* convert to fulltable ID on 339 * this disk */ 340 } 341 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 342 SpareRegion = FullTableID / info->FullTablesPerSpareRegion; 343 SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; 344 } 345 FullTableOffset = SUID % sus_per_fulltable; 346 TableID = FullTableOffset / info->SUsPerTable; 347 TableOffset = FullTableOffset - TableID * info->SUsPerTable; 348 BlockID = TableOffset / info->PUsPerBlock; 349 BlockOffset = TableOffset - BlockID * info->PUsPerBlock; 350 BlockID %= info->BlocksPerTable; 351 RepIndex = info->PUsPerBlock - TableID; 352 if (!raidPtr->noRotate) 353 BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0); 354 *col = info->LayoutTable[BlockID][BlockOffset]; 355 356 /* remap to distributed spare space if indicated */ 357 if (remap) { 358 RF_ASSERT(raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared || 359 (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal)); 360 rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); 361 } else { 362 363 outSU = base_suid; 364 outSU += FullTableID * fulltable_depth; /* offs to strt of FT */ 365 outSU += SpareSpace; /* skip rsvd spare space */ 366 outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */ 367 outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */ 368 } 369 outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within 370 * a PU */ 371 372 /* convert SUs to sectors, and, if not aligned to SU boundary, add in 373 * offset to sector. */ 374 *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); 375 376 RF_ASSERT(*col != -1); 377 } 378 379 380 /* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */ 381 void 382 rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, 383 RF_RowCol_t *row, RF_RowCol_t *col, 384 RF_SectorNum_t *diskSector, int remap) 385 { 386 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 387 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 388 RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; 389 RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; 390 RF_StripeNum_t BlockID, BlockOffset, RepIndex; 391 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; 392 RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 393 RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0; 394 395 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); 396 397 /* compute row & (possibly) spare space exactly as before */ 398 FullTableID = SUID / sus_per_fulltable; 399 if (raidPtr->numRow == 1) 400 *row = 0; /* avoid a mod and a div in the common case */ 401 else { 402 *row = FullTableID % raidPtr->numRow; 403 FullTableID /= raidPtr->numRow; /* convert to fulltable ID on 404 * this disk */ 405 } 406 if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { 407 SpareRegion = FullTableID / info->FullTablesPerSpareRegion; 408 SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; 409 } 410 /* compute BlockID and RepIndex exactly as before */ 411 FullTableOffset = SUID % sus_per_fulltable; 412 TableID = FullTableOffset / info->SUsPerTable; 413 TableOffset = FullTableOffset - TableID * info->SUsPerTable; 414 /* TableOffset = FullTableOffset % info->SUsPerTable; */ 415 /* BlockID = (TableOffset / info->PUsPerBlock) % 416 * info->BlocksPerTable; */ 417 BlockID = TableOffset / info->PUsPerBlock; 418 /* BlockOffset = TableOffset % info->PUsPerBlock; */ 419 BlockOffset = TableOffset - BlockID * info->PUsPerBlock; 420 BlockID %= info->BlocksPerTable; 421 422 /* the parity block is in the position indicated by RepIndex */ 423 RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID; 424 *col = info->LayoutTable[BlockID][RepIndex]; 425 426 if (remap) { 427 RF_ASSERT(raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared || 428 (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal)); 429 rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); 430 } else { 431 432 /* compute sector as before, except use RepIndex instead of 433 * BlockOffset */ 434 outSU = base_suid; 435 outSU += FullTableID * fulltable_depth; 436 outSU += SpareSpace; /* skip rsvd spare space */ 437 outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; 438 outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU; 439 } 440 441 outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); 442 *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); 443 444 RF_ASSERT(*col != -1); 445 } 446 /* returns an array of ints identifying the disks that comprise the stripe containing the indicated address. 447 * the caller must _never_ attempt to modify this array. 448 */ 449 void 450 rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, 451 RF_RowCol_t **diskids, RF_RowCol_t *outRow) 452 { 453 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 454 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 455 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; 456 RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 457 RF_StripeNum_t base_suid = 0; 458 RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr); 459 RF_StripeNum_t stripeID, FullTableID; 460 int tableOffset; 461 462 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); 463 FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array 464 * (across rows) */ 465 *outRow = FullTableID % raidPtr->numRow; 466 stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset 467 * into array */ 468 tableOffset = (stripeID % info->BlocksPerTable); /* find offset into 469 * block design table */ 470 *diskids = info->LayoutTable[tableOffset]; 471 } 472 /* This returns the default head-separation limit, which is measured 473 * in "required units for reconstruction". Each time a disk fetches 474 * a unit, it bumps a counter. The head-sep code prohibits any disk 475 * from getting more than headSepLimit counter values ahead of any 476 * other. 477 * 478 * We assume here that the number of floating recon buffers is already 479 * set. There are r stripes to be reconstructed in each table, and so 480 * if we have a total of B buffers, we can have at most B/r tables 481 * under recon at any one time. In each table, lambda units are required 482 * from each disk, so given B buffers, the head sep limit has to be 483 * (lambda*B)/r units. We subtract one to avoid weird boundary cases. 484 * 485 * for example, suppose were given 50 buffers, r=19, and lambda=4 as in 486 * the 20.5 design. There are 19 stripes/table to be reconstructed, so 487 * we can have 50/19 tables concurrently under reconstruction, which means 488 * we can allow the fastest disk to get 50/19 tables ahead of the slower 489 * disk. There are lambda "required units" for each disk, so the fastest 490 * disk can get 4*50/19 = 10 counter values ahead of the slowest. 491 * 492 * If numBufsToAccumulate is not 1, we need to limit the head sep further 493 * because multiple bufs will be required for each stripe under recon. 494 */ 495 RF_HeadSepLimit_t 496 rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr) 497 { 498 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 499 500 return (info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate); 501 } 502 /* returns the default number of recon buffers to use. The value 503 * is somewhat arbitrary...it's intended to be large enough to allow 504 * for a reasonably large head-sep limit, but small enough that you 505 * don't use up all your system memory with buffers. 506 */ 507 int 508 rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr) 509 { 510 return (100 * rf_numBufsToAccumulate); 511 } 512 /* sectors in the last fulltable of the array need to be handled 513 * specially since this fulltable can be incomplete. this function 514 * changes the values of certain params to handle this. 515 * 516 * the idea here is that MapSector et. al. figure out which disk the 517 * addressed unit lives on by computing the modulos of the unit number 518 * with the number of units per fulltable, table, etc. In the last 519 * fulltable, there are fewer units per fulltable, so we need to adjust 520 * the number of user data units per fulltable to reflect this. 521 * 522 * so, we (1) convert the fulltable size and depth parameters to 523 * the size of the partial fulltable at the end, (2) compute the 524 * disk sector offset where this fulltable starts, and (3) convert 525 * the users stripe unit number from an offset into the array to 526 * an offset into the last fulltable. 527 */ 528 void 529 rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, 530 RF_StripeNum_t *SUID, 531 RF_StripeCount_t *sus_per_fulltable, 532 RF_StripeCount_t *fulltable_depth, 533 RF_StripeNum_t *base_suid) 534 { 535 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 536 537 if (*SUID >= info->FullTableLimitSUID) { 538 /* new full table size is size of last full table on disk */ 539 *sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable; 540 541 /* new full table depth is corresponding depth */ 542 *fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; 543 544 /* set up the new base offset */ 545 *base_suid = info->DiskOffsetOfLastFullTableInSUs; 546 547 /* convert users array address to an offset into the last 548 * fulltable */ 549 *SUID -= info->FullTableLimitSUID; 550 } 551 } 552 /* 553 * map a stripe ID to a parity stripe ID. 554 * See comment above RaidAddressToParityStripeID in layout.c. 555 */ 556 void 557 rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, 558 RF_StripeNum_t stripeID, 559 RF_StripeNum_t *psID, 560 RF_ReconUnitNum_t *which_ru) 561 { 562 RF_DeclusteredConfigInfo_t *info; 563 564 info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 565 566 *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) 567 * info->BlocksPerTable + (stripeID % info->BlocksPerTable); 568 *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) 569 / info->BlocksPerTable; 570 RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU); 571 } 572 /* 573 * Called from MapSector and MapParity to retarget an access at the spare unit. 574 * Modifies the "col" and "outSU" parameters only. 575 */ 576 void 577 rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr, 578 RF_DeclusteredConfigInfo_t *info, 579 RF_RowCol_t row, 580 RF_StripeNum_t FullTableID, 581 RF_StripeNum_t TableID, 582 RF_SectorNum_t BlockID, 583 RF_StripeNum_t base_suid, 584 RF_StripeNum_t SpareRegion, 585 RF_RowCol_t *outCol, 586 RF_StripeNum_t *outSU) 587 { 588 RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, 589 which_ft; 590 591 /* 592 * note that FullTableID and hence SpareRegion may have gotten 593 * tweaked by rf_decluster_adjust_params. We detect this by 594 * noticing that base_suid is not 0. 595 */ 596 if (base_suid == 0) { 597 ftID = FullTableID; 598 } else { 599 /* 600 * There may be > 1.0 full tables in the last (i.e. partial) 601 * spare region. find out which of these we're in. 602 */ 603 lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs; 604 which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU); 605 606 /* compute the actual full table ID */ 607 ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft; 608 SpareRegion = info->NumCompleteSRs; 609 } 610 TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion; 611 612 *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk; 613 RF_ASSERT(*outCol != -1); 614 615 spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ? 616 info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU : 617 (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs; 618 *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs; 619 if (*outSU >= layoutPtr->stripeUnitsPerDisk) { 620 printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n", (long) *outSU); 621 } 622 } 623 624 #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ 625 626 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 627 int 628 rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol) 629 { 630 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 631 RF_SparetWait_t *req; 632 int retcode; 633 634 RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *)); 635 req->C = raidPtr->numCol; 636 req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; 637 req->fcol = fcol; 638 req->SUsPerPU = raidPtr->Layout.SUsPerPU; 639 req->TablesPerSpareRegion = info->TablesPerSpareRegion; 640 req->BlocksPerTable = info->BlocksPerTable; 641 req->TableDepthInPUs = info->TableDepthInPUs; 642 req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs; 643 644 retcode = rf_GetSpareTableFromDaemon(req); 645 RF_ASSERT(!retcode); /* XXX -- fix this to recover gracefully -- 646 * XXX */ 647 return (retcode); 648 } 649 #endif 650 #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) 651 /* 652 * Invoked via ioctl to install a spare table in the kernel. 653 */ 654 int 655 rf_SetSpareTable(RF_Raid_t *raidPtr, void *data) 656 { 657 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 658 RF_SpareTableEntry_t **ptrs; 659 int i, retcode; 660 661 /* what we need to copyin is a 2-d array, so first copyin the user 662 * pointers to the rows in the table */ 663 RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); 664 retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 665 666 if (retcode) 667 return (retcode); 668 669 /* now allocate kernel space for the row pointers */ 670 RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); 671 672 /* now allocate kernel space for each row in the table, and copy it in 673 * from user space */ 674 for (i = 0; i < info->TablesPerSpareRegion; i++) { 675 RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *)); 676 retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); 677 if (retcode) { 678 info->SpareTable = NULL; /* blow off the memory 679 * we've allocated */ 680 return (retcode); 681 } 682 } 683 684 /* free up the temporary array we used */ 685 RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 686 687 return (0); 688 } 689 690 RF_ReconUnitCount_t 691 rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr) 692 { 693 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 694 695 return (((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk); 696 } 697 #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ 698 699 void 700 rf_FreeSpareTable(RF_Raid_t *raidPtr) 701 { 702 long i; 703 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 704 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 705 RF_SpareTableEntry_t **table = info->SpareTable; 706 707 for (i = 0; i < info->TablesPerSpareRegion; i++) { 708 RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); 709 } 710 RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 711 info->SpareTable = (RF_SpareTableEntry_t **) NULL; 712 } 713