1 /* $NetBSD: rf_decluster.c,v 1.27 2023/09/25 21:59:38 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /*---------------------------------------------------------------------- 30 * 31 * rf_decluster.c -- code related to the declustered layout 32 * 33 * Created 10-21-92 (MCH) 34 * 35 * Nov 93: adding support for distributed sparing. This code is a little 36 * complex: the basic layout used is as follows: 37 * let F = (v-1)/GCD(r,v-1). The spare space for each set of 38 * F consecutive fulltables is grouped together and placed after 39 * that set of tables. 40 * +------------------------------+ 41 * | F fulltables | 42 * | Spare Space | 43 * | F fulltables | 44 * | Spare Space | 45 * | ... | 46 * +------------------------------+ 47 * 48 *--------------------------------------------------------------------*/ 49 50 #include <sys/cdefs.h> 51 __KERNEL_RCSID(0, "$NetBSD: rf_decluster.c,v 1.27 2023/09/25 21:59:38 oster Exp $"); 52 53 #include <dev/raidframe/raidframevar.h> 54 55 #include "rf_archs.h" 56 #include "rf_raid.h" 57 #include "rf_decluster.h" 58 #include "rf_debugMem.h" 59 #include "rf_utils.h" 60 #include "rf_alloclist.h" 61 #include "rf_general.h" 62 #include "rf_kintf.h" 63 #include "rf_shutdown.h" 64 65 #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) 66 67 /* configuration code */ 68 69 int 70 rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, 71 RF_Config_t *cfgPtr) 72 { 73 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 74 int b, v, k, r, lambda; /* block design params */ 75 int i, j; 76 RF_RowCol_t *first_avail_slot; 77 RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk; 78 RF_DeclusteredConfigInfo_t *info; 79 RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, 80 extraPUsPerDisk; 81 RF_StripeCount_t totSparePUsPerDisk; 82 RF_SectorNum_t diskOffsetOfLastFullTableInSUs; 83 RF_SectorCount_t SpareSpaceInSUs; 84 char *cfgBuf = (char *) (cfgPtr->layoutSpecific); 85 RF_StripeNum_t l, SUID; 86 87 SUID = l = 0; 88 numCompleteSpareRegionsPerDisk = 0; 89 90 /* 1. create layout specific structure */ 91 info = RF_MallocAndAdd(sizeof(*info), raidPtr->cleanupList); 92 if (info == NULL) 93 return (ENOMEM); 94 layoutPtr->layoutSpecificInfo = (void *) info; 95 info->SpareTable = NULL; 96 97 /* 2. extract parameters from the config structure */ 98 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { 99 (void)memcpy(info->sparemap_fname, cfgBuf, RF_SPAREMAP_NAME_LEN); 100 } 101 cfgBuf += RF_SPAREMAP_NAME_LEN; 102 103 b = *((int *) cfgBuf); 104 cfgBuf += sizeof(int); 105 v = *((int *) cfgBuf); 106 cfgBuf += sizeof(int); 107 k = *((int *) cfgBuf); 108 cfgBuf += sizeof(int); 109 r = *((int *) cfgBuf); 110 cfgBuf += sizeof(int); 111 lambda = *((int *) cfgBuf); 112 cfgBuf += sizeof(int); 113 raidPtr->noRotate = *((int *) cfgBuf); 114 cfgBuf += sizeof(int); 115 116 /* the sparemaps are generated assuming that parity is rotated, so we 117 * issue a warning if both distributed sparing and no-rotate are on at 118 * the same time */ 119 if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) { 120 RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n"); 121 } 122 if (raidPtr->numCol != v) { 123 RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol); 124 return (EINVAL); 125 } 126 /* 3. set up the values used in the mapping code */ 127 info->BlocksPerTable = b; 128 info->Lambda = lambda; 129 info->NumParityReps = info->groupSize = k; 130 info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU; /* b blks, k-1 SUs each */ 131 info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */ 132 info->PUsPerBlock = k - 1; 133 info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU; 134 info->TableDepthInPUs = (b * k) / v; 135 info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */ 136 137 /* used only in distributed sparing case */ 138 info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1); /* (v-1)/gcd fulltables */ 139 info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion; 140 info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v - 1)) * layoutPtr->SUsPerPU; 141 142 /* check to make sure the block design is sufficiently small */ 143 if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { 144 if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) { 145 RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n", 146 (int) info->FullTableDepthInPUs, 147 (int) info->SpareSpaceDepthPerRegionInSUs, 148 (int) layoutPtr->stripeUnitsPerDisk); 149 return (EINVAL); 150 } 151 } else { 152 if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) { 153 RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n", 154 (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU), \ 155 (int) layoutPtr->stripeUnitsPerDisk); 156 return (EINVAL); 157 } 158 } 159 160 161 /* compute the size of each disk, and the number of tables in the last 162 * fulltable (which need not be complete) */ 163 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 164 165 PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU; 166 spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs + 167 (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v - 1)); 168 info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU; 169 170 numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs; 171 info->NumCompleteSRs = numCompleteSpareRegionsPerDisk; 172 extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs; 173 174 /* assume conservatively that we need the full amount of spare 175 * space in one region in order to provide spares for the 176 * partial spare region at the end of the array. We set "i" 177 * to the number of tables in the partial spare region. This 178 * may actually include some fulltables. */ 179 extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); 180 if (extraPUsPerDisk <= 0) 181 i = 0; 182 else 183 i = extraPUsPerDisk / info->TableDepthInPUs; 184 185 complete_FT_count = (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion / k) + i / k); 186 info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; 187 info->ExtraTablesPerDisk = i % k; 188 189 /* note that in the last spare region, the spare space is 190 * complete even though data/parity space is not */ 191 totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); 192 info->TotSparePUsPerDisk = totSparePUsPerDisk; 193 194 layoutPtr->stripeUnitsPerDisk = 195 ((complete_FT_count) * info->FullTableDepthInPUs + /* data & parity space */ 196 info->ExtraTablesPerDisk * info->TableDepthInPUs + 197 totSparePUsPerDisk /* spare space */ 198 ) * layoutPtr->SUsPerPU; 199 layoutPtr->dataStripeUnitsPerDisk = 200 (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs) 201 * layoutPtr->SUsPerPU * (k - 1) / k; 202 203 } else { 204 /* non-dist spare case: force each disk to contain an 205 * integral number of tables */ 206 layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU); 207 layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU); 208 209 /* compute the number of tables in the last fulltable, which 210 * need not be complete */ 211 complete_FT_count = 212 ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->FullTableDepthInPUs); 213 214 info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; 215 info->ExtraTablesPerDisk = 216 ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k; 217 } 218 219 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; 220 221 /* find the disk offset of the stripe unit where the last fulltable 222 * starts */ 223 numCompleteFullTablesPerDisk = complete_FT_count; 224 diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 225 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 226 SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs; 227 diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs; 228 info->DiskOffsetOfLastSpareSpaceChunkInSUs = 229 diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; 230 } 231 info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs; 232 info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk; 233 234 /* 4. create and initialize the lookup tables */ 235 info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList); 236 if (info->LayoutTable == NULL) 237 return (ENOMEM); 238 info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList); 239 if (info->OffsetTable == NULL) 240 return (ENOMEM); 241 info->BlockTable = rf_make_2d_array(info->TableDepthInPUs * layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList); 242 if (info->BlockTable == NULL) 243 return (ENOMEM); 244 245 first_avail_slot = rf_make_1d_array(v, NULL); 246 if (first_avail_slot == NULL) 247 return (ENOMEM); 248 249 for (i = 0; i < b; i++) 250 for (j = 0; j < k; j++) 251 info->LayoutTable[i][j] = *cfgBuf++; 252 253 /* initialize offset table */ 254 for (i = 0; i < b; i++) 255 for (j = 0; j < k; j++) { 256 info->OffsetTable[i][j] = first_avail_slot[info->LayoutTable[i][j]]; 257 first_avail_slot[info->LayoutTable[i][j]]++; 258 } 259 260 /* initialize block table */ 261 for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) { 262 for (i = 0; i < b; i++) { 263 for (j = 0; j < k; j++) { 264 info->BlockTable[(info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l] 265 [info->LayoutTable[i][j]] = SUID; 266 } 267 SUID++; 268 } 269 } 270 271 rf_free_1d_array(first_avail_slot, v); 272 273 /* 5. set up the remaining redundant-but-useful parameters */ 274 275 raidPtr->totalSectors = (k * complete_FT_count + info->ExtraTablesPerDisk) * 276 info->SUsPerTable * layoutPtr->sectorsPerStripeUnit; 277 layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k - 1); 278 279 /* strange evaluation order below to try and minimize overflow 280 * problems */ 281 282 layoutPtr->dataSectorsPerStripe = (k - 1) * layoutPtr->sectorsPerStripeUnit; 283 layoutPtr->numDataCol = k - 1; 284 layoutPtr->numParityCol = 1; 285 286 return (0); 287 } 288 /* declustering with distributed sparing */ 289 static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t); 290 static void 291 rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg) 292 { 293 RF_DeclusteredConfigInfo_t *info; 294 RF_Raid_t *raidPtr; 295 296 raidPtr = (RF_Raid_t *) arg; 297 info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 298 if (info->SpareTable) 299 rf_FreeSpareTable(raidPtr); 300 } 301 302 int 303 rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, 304 RF_Config_t *cfgPtr) 305 { 306 int rc; 307 308 rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr); 309 if (rc) 310 return (rc); 311 rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr); 312 313 return (0); 314 } 315 316 void 317 rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, 318 RF_RowCol_t *col, 319 RF_SectorNum_t *diskSector, int remap) 320 { 321 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 322 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 323 RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; 324 RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; 325 RF_StripeNum_t BlockID, BlockOffset, RepIndex; 326 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; 327 RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 328 RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0; 329 330 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); 331 332 FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array 333 * (across rows) */ 334 335 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 336 SpareRegion = FullTableID / info->FullTablesPerSpareRegion; 337 SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; 338 } 339 FullTableOffset = SUID % sus_per_fulltable; 340 TableID = FullTableOffset / info->SUsPerTable; 341 TableOffset = FullTableOffset - TableID * info->SUsPerTable; 342 BlockID = TableOffset / info->PUsPerBlock; 343 BlockOffset = TableOffset - BlockID * info->PUsPerBlock; 344 BlockID %= info->BlocksPerTable; 345 RepIndex = info->PUsPerBlock - TableID; 346 if (!raidPtr->noRotate) 347 BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0); 348 *col = info->LayoutTable[BlockID][BlockOffset]; 349 350 /* remap to distributed spare space if indicated */ 351 if (remap) { 352 RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared); 353 rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); 354 } else { 355 356 outSU = base_suid; 357 outSU += FullTableID * fulltable_depth; /* offs to strt of FT */ 358 outSU += SpareSpace; /* skip rsvd spare space */ 359 outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */ 360 outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */ 361 } 362 outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within 363 * a PU */ 364 365 /* convert SUs to sectors, and, if not aligned to SU boundary, add in 366 * offset to sector. */ 367 *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); 368 369 RF_ASSERT(*col != -1); 370 } 371 372 373 /* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */ 374 void 375 rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, 376 RF_RowCol_t *col, 377 RF_SectorNum_t *diskSector, int remap) 378 { 379 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 380 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 381 RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; 382 RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; 383 RF_StripeNum_t BlockID, RepIndex; 384 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; 385 RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 386 RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0; 387 388 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); 389 390 /* compute row & (possibly) spare space exactly as before */ 391 FullTableID = SUID / sus_per_fulltable; 392 393 if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { 394 SpareRegion = FullTableID / info->FullTablesPerSpareRegion; 395 SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; 396 } 397 /* compute BlockID and RepIndex exactly as before */ 398 FullTableOffset = SUID % sus_per_fulltable; 399 TableID = FullTableOffset / info->SUsPerTable; 400 TableOffset = FullTableOffset - TableID * info->SUsPerTable; 401 /* TableOffset = FullTableOffset % info->SUsPerTable; */ 402 /* BlockID = (TableOffset / info->PUsPerBlock) % 403 * info->BlocksPerTable; */ 404 BlockID = TableOffset / info->PUsPerBlock; 405 BlockID %= info->BlocksPerTable; 406 407 /* the parity block is in the position indicated by RepIndex */ 408 RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID; 409 *col = info->LayoutTable[BlockID][RepIndex]; 410 411 if (remap) { 412 RF_ASSERT(raidPtr->Disks[*col].status == rf_ds_reconstructing || raidPtr->Disks[*col].status == rf_ds_dist_spared); 413 rf_remap_to_spare_space(layoutPtr, info, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); 414 } else { 415 416 /* compute sector as before, except use RepIndex instead of 417 * BlockOffset */ 418 outSU = base_suid; 419 outSU += FullTableID * fulltable_depth; 420 outSU += SpareSpace; /* skip rsvd spare space */ 421 outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; 422 outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU; 423 } 424 425 outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); 426 *diskSector = outSU * layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); 427 428 RF_ASSERT(*col != -1); 429 } 430 /* returns an array of ints identifying the disks that comprise the stripe containing the indicated address. 431 * the caller must _never_ attempt to modify this array. 432 */ 433 void 434 rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, 435 RF_RowCol_t **diskids) 436 { 437 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 438 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 439 RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; 440 RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; 441 RF_StripeNum_t base_suid = 0; 442 RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr); 443 RF_StripeNum_t stripeID; 444 int tableOffset; 445 446 rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); 447 stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset 448 * into array */ 449 tableOffset = (stripeID % info->BlocksPerTable); /* find offset into 450 * block design table */ 451 *diskids = info->LayoutTable[tableOffset]; 452 } 453 /* This returns the default head-separation limit, which is measured 454 * in "required units for reconstruction". Each time a disk fetches 455 * a unit, it bumps a counter. The head-sep code prohibits any disk 456 * from getting more than headSepLimit counter values ahead of any 457 * other. 458 * 459 * We assume here that the number of floating recon buffers is already 460 * set. There are r stripes to be reconstructed in each table, and so 461 * if we have a total of B buffers, we can have at most B/r tables 462 * under recon at any one time. In each table, lambda units are required 463 * from each disk, so given B buffers, the head sep limit has to be 464 * (lambda*B)/r units. We subtract one to avoid weird boundary cases. 465 * 466 * for example, suppose were given 50 buffers, r=19, and lambda=4 as in 467 * the 20.5 design. There are 19 stripes/table to be reconstructed, so 468 * we can have 50/19 tables concurrently under reconstruction, which means 469 * we can allow the fastest disk to get 50/19 tables ahead of the slower 470 * disk. There are lambda "required units" for each disk, so the fastest 471 * disk can get 4*50/19 = 10 counter values ahead of the slowest. 472 * 473 * If numBufsToAccumulate is not 1, we need to limit the head sep further 474 * because multiple bufs will be required for each stripe under recon. 475 */ 476 RF_HeadSepLimit_t 477 rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr) 478 { 479 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 480 481 return (info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate); 482 } 483 /* returns the default number of recon buffers to use. The value 484 * is somewhat arbitrary...it's intended to be large enough to allow 485 * for a reasonably large head-sep limit, but small enough that you 486 * don't use up all your system memory with buffers. 487 */ 488 int 489 rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t * raidPtr) 490 { 491 return (100 * rf_numBufsToAccumulate); 492 } 493 /* sectors in the last fulltable of the array need to be handled 494 * specially since this fulltable can be incomplete. this function 495 * changes the values of certain params to handle this. 496 * 497 * the idea here is that MapSector et. al. figure out which disk the 498 * addressed unit lives on by computing the modulos of the unit number 499 * with the number of units per fulltable, table, etc. In the last 500 * fulltable, there are fewer units per fulltable, so we need to adjust 501 * the number of user data units per fulltable to reflect this. 502 * 503 * so, we (1) convert the fulltable size and depth parameters to 504 * the size of the partial fulltable at the end, (2) compute the 505 * disk sector offset where this fulltable starts, and (3) convert 506 * the users stripe unit number from an offset into the array to 507 * an offset into the last fulltable. 508 */ 509 void 510 rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, 511 RF_StripeNum_t *SUID, 512 RF_StripeCount_t *sus_per_fulltable, 513 RF_StripeCount_t *fulltable_depth, 514 RF_StripeNum_t *base_suid) 515 { 516 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 517 518 if (*SUID >= info->FullTableLimitSUID) { 519 /* new full table size is size of last full table on disk */ 520 *sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable; 521 522 /* new full table depth is corresponding depth */ 523 *fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; 524 525 /* set up the new base offset */ 526 *base_suid = info->DiskOffsetOfLastFullTableInSUs; 527 528 /* convert users array address to an offset into the last 529 * fulltable */ 530 *SUID -= info->FullTableLimitSUID; 531 } 532 } 533 /* 534 * map a stripe ID to a parity stripe ID. 535 * See comment above RaidAddressToParityStripeID in layout.c. 536 */ 537 void 538 rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, 539 RF_StripeNum_t stripeID, 540 RF_StripeNum_t *psID, 541 RF_ReconUnitNum_t *which_ru) 542 { 543 RF_DeclusteredConfigInfo_t *info; 544 545 info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 546 547 *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) 548 * info->BlocksPerTable + (stripeID % info->BlocksPerTable); 549 *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) 550 / info->BlocksPerTable; 551 RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU); 552 } 553 /* 554 * Called from MapSector and MapParity to retarget an access at the spare unit. 555 * Modifies the "col" and "outSU" parameters only. 556 */ 557 void 558 rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr, 559 RF_DeclusteredConfigInfo_t *info, 560 RF_StripeNum_t FullTableID, 561 RF_StripeNum_t TableID, 562 RF_SectorNum_t BlockID, 563 RF_StripeNum_t base_suid, 564 RF_StripeNum_t SpareRegion, 565 RF_RowCol_t *outCol, 566 RF_StripeNum_t *outSU) 567 { 568 RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, 569 which_ft; 570 571 /* 572 * note that FullTableID and hence SpareRegion may have gotten 573 * tweaked by rf_decluster_adjust_params. We detect this by 574 * noticing that base_suid is not 0. 575 */ 576 if (base_suid == 0) { 577 ftID = FullTableID; 578 } else { 579 /* 580 * There may be > 1.0 full tables in the last (i.e. partial) 581 * spare region. find out which of these we're in. 582 */ 583 lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs; 584 which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU); 585 586 /* compute the actual full table ID */ 587 ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft; 588 SpareRegion = info->NumCompleteSRs; 589 } 590 TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion; 591 592 *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk; 593 RF_ASSERT(*outCol != -1); 594 595 spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ? 596 info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU : 597 (SpareRegion + 1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs; 598 *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs; 599 if (*outSU >= layoutPtr->stripeUnitsPerDisk) { 600 printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n", (long) *outSU); 601 } 602 } 603 604 #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ 605 606 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 607 int 608 rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t fcol) 609 { 610 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 611 RF_SparetWait_t *req; 612 int retcode; 613 614 req = RF_Malloc(sizeof(*req)); 615 req->C = raidPtr->numCol; 616 req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; 617 req->fcol = fcol; 618 req->SUsPerPU = raidPtr->Layout.SUsPerPU; 619 req->TablesPerSpareRegion = info->TablesPerSpareRegion; 620 req->BlocksPerTable = info->BlocksPerTable; 621 req->TableDepthInPUs = info->TableDepthInPUs; 622 req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs; 623 624 retcode = rf_GetSpareTableFromDaemon(req); 625 RF_ASSERT(!retcode); /* XXX -- fix this to recover gracefully -- 626 * XXX */ 627 return (retcode); 628 } 629 #endif 630 #if (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) 631 /* 632 * Invoked via ioctl to install a spare table in the kernel. 633 */ 634 int 635 rf_SetSpareTable(RF_Raid_t *raidPtr, void *data) 636 { 637 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; 638 RF_SpareTableEntry_t **ptrs; 639 int i, retcode; 640 641 /* what we need to copyin is a 2-d array, so first copyin the user 642 * pointers to the rows in the table */ 643 size_t ptrslen = info->TablesPerSpareRegion * sizeof(*ptrs); 644 ptrs = RF_Malloc(ptrslen); 645 retcode = copyin(data, ptrs, ptrslen); 646 647 if (retcode) 648 return (retcode); 649 650 /* now allocate kernel space for the row pointers */ 651 info->SpareTable = RF_Malloc(info->TablesPerSpareRegion * 652 sizeof(*info->SpareTable)); 653 654 /* now allocate kernel space for each row in the table, and copy it in 655 * from user space */ 656 size_t len = info->BlocksPerTable * sizeof(**info->SpareTable); 657 for (i = 0; i < info->TablesPerSpareRegion; i++) { 658 info->SpareTable[i] = RF_Malloc(len); 659 retcode = copyin(ptrs[i], info->SpareTable[i], len); 660 if (retcode) { 661 info->SpareTable = NULL; /* blow off the memory 662 * we've allocated */ 663 return (retcode); 664 } 665 } 666 667 /* free up the temporary array we used */ 668 RF_Free(ptrs, ptrslen); 669 670 return (0); 671 } 672 673 RF_ReconUnitCount_t 674 rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr) 675 { 676 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 677 678 return (((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk); 679 } 680 #endif /* (RF_INCLUDE_PARITY_DECLUSTERING > 0) || (RF_INCLUDE_PARITY_DECLUSTERING_PQ > 0) */ 681 682 void 683 rf_FreeSpareTable(RF_Raid_t *raidPtr) 684 { 685 long i; 686 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 687 RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; 688 RF_SpareTableEntry_t **table = info->SpareTable; 689 690 for (i = 0; i < info->TablesPerSpareRegion; i++) { 691 RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); 692 } 693 RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); 694 info->SpareTable = NULL; 695 } 696