1 /* $NetBSD: rf_paritylogging.c,v 1.10 2000/02/12 16:06:27 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 30 /* 31 parity logging configuration, dag selection, and mapping is implemented here 32 */ 33 34 #include "rf_archs.h" 35 36 #if RF_INCLUDE_PARITYLOGGING > 0 37 38 #include "rf_types.h" 39 #include "rf_raid.h" 40 #include "rf_dag.h" 41 #include "rf_dagutils.h" 42 #include "rf_dagfuncs.h" 43 #include "rf_dagffrd.h" 44 #include "rf_dagffwr.h" 45 #include "rf_dagdegrd.h" 46 #include "rf_dagdegwr.h" 47 #include "rf_paritylog.h" 48 #include "rf_paritylogDiskMgr.h" 49 #include "rf_paritylogging.h" 50 #include "rf_parityloggingdags.h" 51 #include "rf_general.h" 52 #include "rf_map.h" 53 #include "rf_utils.h" 54 #include "rf_shutdown.h" 55 56 typedef struct RF_ParityLoggingConfigInfo_s { 57 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by 58 * IdentifyStripe */ 59 } RF_ParityLoggingConfigInfo_t; 60 61 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID); 62 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); 63 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); 64 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); 65 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); 66 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); 67 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); 68 69 int 70 rf_ConfigureParityLogging( 71 RF_ShutdownList_t ** listp, 72 RF_Raid_t * raidPtr, 73 RF_Config_t * cfgPtr) 74 { 75 int i, j, startdisk, rc; 76 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; 77 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; 78 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 79 RF_ParityLoggingConfigInfo_t *info; 80 RF_ParityLog_t *l = NULL, *next; 81 caddr_t lHeapPtr; 82 83 if (rf_numParityRegions <= 0) 84 return(EINVAL); 85 86 /* 87 * We create multiple entries on the shutdown list here, since 88 * this configuration routine is fairly complicated in and of 89 * itself, and this makes backing out of a failed configuration 90 * much simpler. 91 */ 92 93 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; 94 95 /* create a parity logging configuration structure */ 96 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), 97 (RF_ParityLoggingConfigInfo_t *), 98 raidPtr->cleanupList); 99 if (info == NULL) 100 return (ENOMEM); 101 layoutPtr->layoutSpecificInfo = (void *) info; 102 103 RF_ASSERT(raidPtr->numRow == 1); 104 105 /* the stripe identifier must identify the disks in each stripe, IN 106 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 107 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), 108 (raidPtr->numCol), 109 raidPtr->cleanupList); 110 if (info->stripeIdentifier == NULL) 111 return (ENOMEM); 112 113 startdisk = 0; 114 for (i = 0; i < (raidPtr->numCol); i++) { 115 for (j = 0; j < (raidPtr->numCol); j++) { 116 info->stripeIdentifier[i][j] = (startdisk + j) % 117 (raidPtr->numCol - 1); 118 } 119 if ((--startdisk) < 0) 120 startdisk = raidPtr->numCol - 1 - 1; 121 } 122 123 /* fill in the remaining layout parameters */ 124 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 125 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << 126 raidPtr->logBytesPerSector; 127 layoutPtr->numParityCol = 1; 128 layoutPtr->numParityLogCol = 1; 129 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - 130 layoutPtr->numParityLogCol; 131 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * 132 layoutPtr->sectorsPerStripeUnit; 133 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 134 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * 135 layoutPtr->sectorsPerStripeUnit; 136 137 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * 138 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 139 140 /* configure parity log parameters 141 * 142 * parameter comment/constraints 143 * ------------------------------------------- 144 * numParityRegions* all regions (except possibly last) 145 * of equal size 146 * totalInCoreLogCapacity* amount of memory in bytes available 147 * for in-core logs (default 1 MB) 148 * numSectorsPerLog# capacity of an in-core log in sectors 149 * (1 * disk track) 150 * numParityLogs total number of in-core logs, 151 * should be at least numParityRegions 152 * regionLogCapacity size of a region log (except possibly 153 * last one) in sectors 154 * totalLogCapacity total amount of log space in sectors 155 * 156 * where '*' denotes a user settable parameter. 157 * Note that logs are fixed to be the size of a disk track, 158 * value #defined in rf_paritylog.h 159 * 160 */ 161 162 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; 163 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; 164 if (rf_parityLogDebug) 165 printf("bytes per sector %d\n", raidPtr->bytesPerSector); 166 167 /* reduce fragmentation within a disk region by adjusting the number 168 * of regions in an attempt to allow an integral number of logs to fit 169 * into a disk region */ 170 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; 171 if (fragmentation > 0) 172 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { 173 if (((totalLogCapacity / (rf_numParityRegions + i)) % 174 raidPtr->numSectorsPerLog) < fragmentation) { 175 rf_numParityRegions++; 176 raidPtr->regionLogCapacity = totalLogCapacity / 177 rf_numParityRegions; 178 fragmentation = raidPtr->regionLogCapacity % 179 raidPtr->numSectorsPerLog; 180 } 181 if (((totalLogCapacity / (rf_numParityRegions - i)) % 182 raidPtr->numSectorsPerLog) < fragmentation) { 183 rf_numParityRegions--; 184 raidPtr->regionLogCapacity = totalLogCapacity / 185 rf_numParityRegions; 186 fragmentation = raidPtr->regionLogCapacity % 187 raidPtr->numSectorsPerLog; 188 } 189 } 190 /* ensure integral number of regions per log */ 191 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / 192 raidPtr->numSectorsPerLog) * 193 raidPtr->numSectorsPerLog; 194 195 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / 196 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); 197 /* to avoid deadlock, must ensure that enough logs exist for each 198 * region to have one simultaneously */ 199 if (raidPtr->numParityLogs < rf_numParityRegions) 200 raidPtr->numParityLogs = rf_numParityRegions; 201 202 /* create region information structs */ 203 printf("Allocating %d bytes for in-core parity region info\n", 204 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); 205 RF_Malloc(raidPtr->regionInfo, 206 (rf_numParityRegions * sizeof(RF_RegionInfo_t)), 207 (RF_RegionInfo_t *)); 208 if (raidPtr->regionInfo == NULL) 209 return (ENOMEM); 210 211 /* last region may not be full capacity */ 212 lastRegionCapacity = raidPtr->regionLogCapacity; 213 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + 214 lastRegionCapacity > totalLogCapacity) 215 lastRegionCapacity = lastRegionCapacity - 216 raidPtr->numSectorsPerLog; 217 218 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / 219 rf_numParityRegions; 220 maxRegionParityRange = raidPtr->regionParityRange; 221 222 /* i can't remember why this line is in the code -wvcii 6/30/95 */ 223 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) 224 regionParityRange++; */ 225 226 /* build pool of unused parity logs */ 227 printf("Allocating %d bytes for %d parity logs\n", 228 raidPtr->numParityLogs * raidPtr->numSectorsPerLog * 229 raidPtr->bytesPerSector, 230 raidPtr->numParityLogs); 231 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 232 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, 233 (caddr_t)); 234 if (raidPtr->parityLogBufferHeap == NULL) 235 return (ENOMEM); 236 lHeapPtr = raidPtr->parityLogBufferHeap; 237 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex); 238 if (rc) { 239 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 240 __FILE__, __LINE__, rc); 241 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 242 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 243 return (ENOMEM); 244 } 245 for (i = 0; i < raidPtr->numParityLogs; i++) { 246 if (i == 0) { 247 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, 248 sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); 249 if (raidPtr->parityLogPool.parityLogs == NULL) { 250 RF_Free(raidPtr->parityLogBufferHeap, 251 raidPtr->numParityLogs * 252 raidPtr->numSectorsPerLog * 253 raidPtr->bytesPerSector); 254 return (ENOMEM); 255 } 256 l = raidPtr->parityLogPool.parityLogs; 257 } else { 258 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), 259 (RF_ParityLog_t *)); 260 if (l->next == NULL) { 261 RF_Free(raidPtr->parityLogBufferHeap, 262 raidPtr->numParityLogs * 263 raidPtr->numSectorsPerLog * 264 raidPtr->bytesPerSector); 265 for (l = raidPtr->parityLogPool.parityLogs; 266 l; 267 l = next) { 268 next = l->next; 269 if (l->records) 270 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); 271 RF_Free(l, sizeof(RF_ParityLog_t)); 272 } 273 return (ENOMEM); 274 } 275 l = l->next; 276 } 277 l->bufPtr = lHeapPtr; 278 lHeapPtr += raidPtr->numSectorsPerLog * 279 raidPtr->bytesPerSector; 280 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * 281 sizeof(RF_ParityLogRecord_t)), 282 (RF_ParityLogRecord_t *)); 283 if (l->records == NULL) { 284 RF_Free(raidPtr->parityLogBufferHeap, 285 raidPtr->numParityLogs * 286 raidPtr->numSectorsPerLog * 287 raidPtr->bytesPerSector); 288 for (l = raidPtr->parityLogPool.parityLogs; 289 l; 290 l = next) { 291 next = l->next; 292 if (l->records) 293 RF_Free(l->records, 294 (raidPtr->numSectorsPerLog * 295 sizeof(RF_ParityLogRecord_t))); 296 RF_Free(l, sizeof(RF_ParityLog_t)); 297 } 298 return (ENOMEM); 299 } 300 } 301 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); 302 if (rc) { 303 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 304 __LINE__, rc); 305 rf_ShutdownParityLoggingPool(raidPtr); 306 return (rc); 307 } 308 /* build pool of region buffers */ 309 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex); 310 if (rc) { 311 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 312 __FILE__, __LINE__, rc); 313 return (ENOMEM); 314 } 315 rc = rf_cond_init(&raidPtr->regionBufferPool.cond); 316 if (rc) { 317 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", 318 __FILE__, __LINE__, rc); 319 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 320 return (ENOMEM); 321 } 322 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * 323 raidPtr->bytesPerSector; 324 printf("regionBufferPool.bufferSize %d\n", 325 raidPtr->regionBufferPool.bufferSize); 326 327 /* for now, only one region at a time may be reintegrated */ 328 raidPtr->regionBufferPool.totalBuffers = 1; 329 330 raidPtr->regionBufferPool.availableBuffers = 331 raidPtr->regionBufferPool.totalBuffers; 332 raidPtr->regionBufferPool.availBuffersIndex = 0; 333 raidPtr->regionBufferPool.emptyBuffersIndex = 0; 334 printf("Allocating %d bytes for regionBufferPool\n", 335 (int) (raidPtr->regionBufferPool.totalBuffers * 336 sizeof(caddr_t))); 337 RF_Malloc(raidPtr->regionBufferPool.buffers, 338 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), 339 (caddr_t *)); 340 if (raidPtr->regionBufferPool.buffers == NULL) { 341 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 342 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 343 return (ENOMEM); 344 } 345 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { 346 printf("Allocating %d bytes for regionBufferPool#%d\n", 347 (int) (raidPtr->regionBufferPool.bufferSize * 348 sizeof(char)), i); 349 RF_Malloc(raidPtr->regionBufferPool.buffers[i], 350 raidPtr->regionBufferPool.bufferSize * sizeof(char), 351 (caddr_t)); 352 if (raidPtr->regionBufferPool.buffers[i] == NULL) { 353 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 354 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 355 for (j = 0; j < i; j++) { 356 RF_Free(raidPtr->regionBufferPool.buffers[i], 357 raidPtr->regionBufferPool.bufferSize * 358 sizeof(char)); 359 } 360 RF_Free(raidPtr->regionBufferPool.buffers, 361 raidPtr->regionBufferPool.totalBuffers * 362 sizeof(caddr_t)); 363 return (ENOMEM); 364 } 365 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, 366 (long) raidPtr->regionBufferPool.buffers[i]); 367 } 368 rc = rf_ShutdownCreate(listp, 369 rf_ShutdownParityLoggingRegionBufferPool, 370 raidPtr); 371 if (rc) { 372 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 373 __LINE__, rc); 374 rf_ShutdownParityLoggingRegionBufferPool(raidPtr); 375 return (rc); 376 } 377 /* build pool of parity buffers */ 378 parityBufferCapacity = maxRegionParityRange; 379 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex); 380 if (rc) { 381 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 382 __FILE__, __LINE__, rc); 383 return (rc); 384 } 385 rc = rf_cond_init(&raidPtr->parityBufferPool.cond); 386 if (rc) { 387 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", 388 __FILE__, __LINE__, rc); 389 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 390 return (ENOMEM); 391 } 392 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * 393 raidPtr->bytesPerSector; 394 printf("parityBufferPool.bufferSize %d\n", 395 raidPtr->parityBufferPool.bufferSize); 396 397 /* for now, only one region at a time may be reintegrated */ 398 raidPtr->parityBufferPool.totalBuffers = 1; 399 400 raidPtr->parityBufferPool.availableBuffers = 401 raidPtr->parityBufferPool.totalBuffers; 402 raidPtr->parityBufferPool.availBuffersIndex = 0; 403 raidPtr->parityBufferPool.emptyBuffersIndex = 0; 404 printf("Allocating %d bytes for parityBufferPool of %d units\n", 405 (int) (raidPtr->parityBufferPool.totalBuffers * 406 sizeof(caddr_t)), 407 raidPtr->parityBufferPool.totalBuffers ); 408 RF_Malloc(raidPtr->parityBufferPool.buffers, 409 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), 410 (caddr_t *)); 411 if (raidPtr->parityBufferPool.buffers == NULL) { 412 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 413 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 414 return (ENOMEM); 415 } 416 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { 417 printf("Allocating %d bytes for parityBufferPool#%d\n", 418 (int) (raidPtr->parityBufferPool.bufferSize * 419 sizeof(char)),i); 420 RF_Malloc(raidPtr->parityBufferPool.buffers[i], 421 raidPtr->parityBufferPool.bufferSize * sizeof(char), 422 (caddr_t)); 423 if (raidPtr->parityBufferPool.buffers == NULL) { 424 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 425 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 426 for (j = 0; j < i; j++) { 427 RF_Free(raidPtr->parityBufferPool.buffers[i], 428 raidPtr->regionBufferPool.bufferSize * 429 sizeof(char)); 430 } 431 RF_Free(raidPtr->parityBufferPool.buffers, 432 raidPtr->regionBufferPool.totalBuffers * 433 sizeof(caddr_t)); 434 return (ENOMEM); 435 } 436 printf("parityBufferPool.buffers[%d] = %lx\n", i, 437 (long) raidPtr->parityBufferPool.buffers[i]); 438 } 439 rc = rf_ShutdownCreate(listp, 440 rf_ShutdownParityLoggingParityBufferPool, 441 raidPtr); 442 if (rc) { 443 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 444 __LINE__, rc); 445 rf_ShutdownParityLoggingParityBufferPool(raidPtr); 446 return (rc); 447 } 448 /* initialize parityLogDiskQueue */ 449 rc = rf_create_managed_mutex(listp, 450 &raidPtr->parityLogDiskQueue.mutex); 451 if (rc) { 452 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 453 __FILE__, __LINE__, rc); 454 return (rc); 455 } 456 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond); 457 if (rc) { 458 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", 459 __FILE__, __LINE__, rc); 460 return (rc); 461 } 462 raidPtr->parityLogDiskQueue.flushQueue = NULL; 463 raidPtr->parityLogDiskQueue.reintQueue = NULL; 464 raidPtr->parityLogDiskQueue.bufHead = NULL; 465 raidPtr->parityLogDiskQueue.bufTail = NULL; 466 raidPtr->parityLogDiskQueue.reintHead = NULL; 467 raidPtr->parityLogDiskQueue.reintTail = NULL; 468 raidPtr->parityLogDiskQueue.logBlockHead = NULL; 469 raidPtr->parityLogDiskQueue.logBlockTail = NULL; 470 raidPtr->parityLogDiskQueue.reintBlockHead = NULL; 471 raidPtr->parityLogDiskQueue.reintBlockTail = NULL; 472 raidPtr->parityLogDiskQueue.freeDataList = NULL; 473 raidPtr->parityLogDiskQueue.freeCommonList = NULL; 474 475 rc = rf_ShutdownCreate(listp, 476 rf_ShutdownParityLoggingDiskQueue, 477 raidPtr); 478 if (rc) { 479 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 480 __LINE__, rc); 481 return (rc); 482 } 483 for (i = 0; i < rf_numParityRegions; i++) { 484 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex); 485 if (rc) { 486 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, 487 __LINE__, rc); 488 for (j = 0; j < i; j++) 489 FreeRegionInfo(raidPtr, j); 490 RF_Free(raidPtr->regionInfo, 491 (rf_numParityRegions * 492 sizeof(RF_RegionInfo_t))); 493 return (ENOMEM); 494 } 495 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); 496 if (rc) { 497 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, 498 __LINE__, rc); 499 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 500 for (j = 0; j < i; j++) 501 FreeRegionInfo(raidPtr, j); 502 RF_Free(raidPtr->regionInfo, 503 (rf_numParityRegions * 504 sizeof(RF_RegionInfo_t))); 505 return (ENOMEM); 506 } 507 raidPtr->regionInfo[i].reintInProgress = RF_FALSE; 508 raidPtr->regionInfo[i].regionStartAddr = 509 raidPtr->regionLogCapacity * i; 510 raidPtr->regionInfo[i].parityStartAddr = 511 raidPtr->regionParityRange * i; 512 if (i < rf_numParityRegions - 1) { 513 raidPtr->regionInfo[i].capacity = 514 raidPtr->regionLogCapacity; 515 raidPtr->regionInfo[i].numSectorsParity = 516 raidPtr->regionParityRange; 517 } else { 518 raidPtr->regionInfo[i].capacity = 519 lastRegionCapacity; 520 raidPtr->regionInfo[i].numSectorsParity = 521 raidPtr->sectorsPerDisk - 522 raidPtr->regionParityRange * i; 523 if (raidPtr->regionInfo[i].numSectorsParity > 524 maxRegionParityRange) 525 maxRegionParityRange = 526 raidPtr->regionInfo[i].numSectorsParity; 527 } 528 raidPtr->regionInfo[i].diskCount = 0; 529 RF_ASSERT(raidPtr->regionInfo[i].capacity + 530 raidPtr->regionInfo[i].regionStartAddr <= 531 totalLogCapacity); 532 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + 533 raidPtr->regionInfo[i].numSectorsParity <= 534 raidPtr->sectorsPerDisk); 535 printf("Allocating %d bytes for region %d\n", 536 (int) (raidPtr->regionInfo[i].capacity * 537 sizeof(RF_DiskMap_t)), i); 538 RF_Malloc(raidPtr->regionInfo[i].diskMap, 539 (raidPtr->regionInfo[i].capacity * 540 sizeof(RF_DiskMap_t)), 541 (RF_DiskMap_t *)); 542 if (raidPtr->regionInfo[i].diskMap == NULL) { 543 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 544 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex); 545 for (j = 0; j < i; j++) 546 FreeRegionInfo(raidPtr, j); 547 RF_Free(raidPtr->regionInfo, 548 (rf_numParityRegions * 549 sizeof(RF_RegionInfo_t))); 550 return (ENOMEM); 551 } 552 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; 553 raidPtr->regionInfo[i].coreLog = NULL; 554 } 555 rc = rf_ShutdownCreate(listp, 556 rf_ShutdownParityLoggingRegionInfo, 557 raidPtr); 558 if (rc) { 559 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 560 __LINE__, rc); 561 rf_ShutdownParityLoggingRegionInfo(raidPtr); 562 return (rc); 563 } 564 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); 565 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; 566 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, 567 rf_ParityLoggingDiskManager, raidPtr,"rf_log"); 568 if (rc) { 569 raidPtr->parityLogDiskQueue.threadState = 0; 570 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", 571 __FILE__, __LINE__, rc); 572 return (ENOMEM); 573 } 574 /* wait for thread to start */ 575 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 576 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { 577 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 578 raidPtr->parityLogDiskQueue.mutex); 579 } 580 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 581 582 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); 583 if (rc) { 584 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc); 585 rf_ShutdownParityLogging(raidPtr); 586 return (rc); 587 } 588 if (rf_parityLogDebug) { 589 printf(" size of disk log in sectors: %d\n", 590 (int) totalLogCapacity); 591 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions); 592 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity); 593 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation); 594 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); 595 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); 596 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); 597 } 598 rf_EnableParityLogging(raidPtr); 599 600 return (0); 601 } 602 603 static void 604 FreeRegionInfo( 605 RF_Raid_t * raidPtr, 606 RF_RegionId_t regionID) 607 { 608 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 609 RF_Free(raidPtr->regionInfo[regionID].diskMap, 610 (raidPtr->regionInfo[regionID].capacity * 611 sizeof(RF_DiskMap_t))); 612 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { 613 rf_ReleaseParityLogs(raidPtr, 614 raidPtr->regionInfo[regionID].coreLog); 615 raidPtr->regionInfo[regionID].coreLog = NULL; 616 } else { 617 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); 618 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); 619 } 620 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 621 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex); 622 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex); 623 } 624 625 626 static void 627 FreeParityLogQueue( 628 RF_Raid_t * raidPtr, 629 RF_ParityLogQueue_t * queue) 630 { 631 RF_ParityLog_t *l1, *l2; 632 633 RF_LOCK_MUTEX(queue->mutex); 634 l1 = queue->parityLogs; 635 while (l1) { 636 l2 = l1; 637 l1 = l2->next; 638 RF_Free(l2->records, (raidPtr->numSectorsPerLog * 639 sizeof(RF_ParityLogRecord_t))); 640 RF_Free(l2, sizeof(RF_ParityLog_t)); 641 } 642 RF_UNLOCK_MUTEX(queue->mutex); 643 rf_mutex_destroy(&queue->mutex); 644 } 645 646 647 static void 648 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) 649 { 650 int i; 651 652 RF_LOCK_MUTEX(queue->mutex); 653 if (queue->availableBuffers != queue->totalBuffers) { 654 printf("Attempt to free region queue which is still in use!\n"); 655 RF_ASSERT(0); 656 } 657 for (i = 0; i < queue->totalBuffers; i++) 658 RF_Free(queue->buffers[i], queue->bufferSize); 659 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t)); 660 RF_UNLOCK_MUTEX(queue->mutex); 661 rf_mutex_destroy(&queue->mutex); 662 } 663 664 static void 665 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) 666 { 667 RF_Raid_t *raidPtr; 668 RF_RegionId_t i; 669 670 raidPtr = (RF_Raid_t *) arg; 671 if (rf_parityLogDebug) { 672 printf("raid%d: ShutdownParityLoggingRegionInfo\n", 673 raidPtr->raidid); 674 } 675 /* free region information structs */ 676 for (i = 0; i < rf_numParityRegions; i++) 677 FreeRegionInfo(raidPtr, i); 678 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * 679 sizeof(raidPtr->regionInfo))); 680 raidPtr->regionInfo = NULL; 681 } 682 683 static void 684 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) 685 { 686 RF_Raid_t *raidPtr; 687 688 raidPtr = (RF_Raid_t *) arg; 689 if (rf_parityLogDebug) { 690 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid); 691 } 692 /* free contents of parityLogPool */ 693 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); 694 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 695 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 696 } 697 698 static void 699 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) 700 { 701 RF_Raid_t *raidPtr; 702 703 raidPtr = (RF_Raid_t *) arg; 704 if (rf_parityLogDebug) { 705 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n", 706 raidPtr->raidid); 707 } 708 FreeRegionBufferQueue(&raidPtr->regionBufferPool); 709 } 710 711 static void 712 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) 713 { 714 RF_Raid_t *raidPtr; 715 716 raidPtr = (RF_Raid_t *) arg; 717 if (rf_parityLogDebug) { 718 printf("raid%d: ShutdownParityLoggingParityBufferPool\n", 719 raidPtr->raidid); 720 } 721 FreeRegionBufferQueue(&raidPtr->parityBufferPool); 722 } 723 724 static void 725 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) 726 { 727 RF_ParityLogData_t *d; 728 RF_CommonLogData_t *c; 729 RF_Raid_t *raidPtr; 730 731 raidPtr = (RF_Raid_t *) arg; 732 if (rf_parityLogDebug) { 733 printf("raid%d: ShutdownParityLoggingDiskQueue\n", 734 raidPtr->raidid); 735 } 736 /* free disk manager stuff */ 737 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); 738 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); 739 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); 740 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); 741 while (raidPtr->parityLogDiskQueue.freeDataList) { 742 d = raidPtr->parityLogDiskQueue.freeDataList; 743 raidPtr->parityLogDiskQueue.freeDataList = 744 raidPtr->parityLogDiskQueue.freeDataList->next; 745 RF_Free(d, sizeof(RF_ParityLogData_t)); 746 } 747 while (raidPtr->parityLogDiskQueue.freeCommonList) { 748 c = raidPtr->parityLogDiskQueue.freeCommonList; 749 rf_mutex_destroy(&c->mutex); 750 raidPtr->parityLogDiskQueue.freeCommonList = 751 raidPtr->parityLogDiskQueue.freeCommonList->next; 752 RF_Free(c, sizeof(RF_CommonLogData_t)); 753 } 754 } 755 756 static void 757 rf_ShutdownParityLogging(RF_ThreadArg_t arg) 758 { 759 RF_Raid_t *raidPtr; 760 761 raidPtr = (RF_Raid_t *) arg; 762 if (rf_parityLogDebug) { 763 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid); 764 } 765 /* shutdown disk thread */ 766 /* This has the desirable side-effect of forcing all regions to be 767 * reintegrated. This is necessary since all parity log maps are 768 * currently held in volatile memory. */ 769 770 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 771 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; 772 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 773 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); 774 /* 775 * pLogDiskThread will now terminate when queues are cleared 776 * now wait for it to be done 777 */ 778 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 779 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) { 780 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 781 raidPtr->parityLogDiskQueue.mutex); 782 } 783 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 784 if (rf_parityLogDebug) { 785 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid); 786 } 787 } 788 789 int 790 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) 791 { 792 return (20); 793 } 794 795 RF_HeadSepLimit_t 796 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) 797 { 798 return (10); 799 } 800 /* return the region ID for a given RAID address */ 801 RF_RegionId_t 802 rf_MapRegionIDParityLogging( 803 RF_Raid_t * raidPtr, 804 RF_SectorNum_t address) 805 { 806 RF_RegionId_t regionID; 807 808 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ 809 regionID = address / raidPtr->regionParityRange; 810 if (regionID == rf_numParityRegions) { 811 /* last region may be larger than other regions */ 812 regionID--; 813 } 814 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); 815 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + 816 raidPtr->regionInfo[regionID].numSectorsParity); 817 RF_ASSERT(regionID < rf_numParityRegions); 818 return (regionID); 819 } 820 821 822 /* given a logical RAID sector, determine physical disk address of data */ 823 void 824 rf_MapSectorParityLogging( 825 RF_Raid_t * raidPtr, 826 RF_RaidAddr_t raidSector, 827 RF_RowCol_t * row, 828 RF_RowCol_t * col, 829 RF_SectorNum_t * diskSector, 830 int remap) 831 { 832 RF_StripeNum_t SUID = raidSector / 833 raidPtr->Layout.sectorsPerStripeUnit; 834 *row = 0; 835 /* *col = (SUID % (raidPtr->numCol - 836 * raidPtr->Layout.numParityLogCol)); */ 837 *col = SUID % raidPtr->Layout.numDataCol; 838 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 839 raidPtr->Layout.sectorsPerStripeUnit + 840 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 841 } 842 843 844 /* given a logical RAID sector, determine physical disk address of parity */ 845 void 846 rf_MapParityParityLogging( 847 RF_Raid_t * raidPtr, 848 RF_RaidAddr_t raidSector, 849 RF_RowCol_t * row, 850 RF_RowCol_t * col, 851 RF_SectorNum_t * diskSector, 852 int remap) 853 { 854 RF_StripeNum_t SUID = raidSector / 855 raidPtr->Layout.sectorsPerStripeUnit; 856 857 *row = 0; 858 /* *col = 859 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt 860 * r->numCol - raidPtr->Layout.numParityLogCol); */ 861 *col = raidPtr->Layout.numDataCol; 862 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 863 raidPtr->Layout.sectorsPerStripeUnit + 864 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 865 } 866 867 868 /* given a regionID and sector offset, determine the physical disk address of the parity log */ 869 void 870 rf_MapLogParityLogging( 871 RF_Raid_t * raidPtr, 872 RF_RegionId_t regionID, 873 RF_SectorNum_t regionOffset, 874 RF_RowCol_t * row, 875 RF_RowCol_t * col, 876 RF_SectorNum_t * startSector) 877 { 878 *row = 0; 879 *col = raidPtr->numCol - 1; 880 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; 881 } 882 883 884 /* given a regionID, determine the physical disk address of the logged 885 parity for that region */ 886 void 887 rf_MapRegionParity( 888 RF_Raid_t * raidPtr, 889 RF_RegionId_t regionID, 890 RF_RowCol_t * row, 891 RF_RowCol_t * col, 892 RF_SectorNum_t * startSector, 893 RF_SectorCount_t * numSector) 894 { 895 *row = 0; 896 *col = raidPtr->numCol - 2; 897 *startSector = raidPtr->regionInfo[regionID].parityStartAddr; 898 *numSector = raidPtr->regionInfo[regionID].numSectorsParity; 899 } 900 901 902 /* given a logical RAID address, determine the participating disks in 903 the stripe */ 904 void 905 rf_IdentifyStripeParityLogging( 906 RF_Raid_t * raidPtr, 907 RF_RaidAddr_t addr, 908 RF_RowCol_t ** diskids, 909 RF_RowCol_t * outRow) 910 { 911 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, 912 addr); 913 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) 914 raidPtr->Layout.layoutSpecificInfo; 915 *outRow = 0; 916 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; 917 } 918 919 920 void 921 rf_MapSIDToPSIDParityLogging( 922 RF_RaidLayout_t * layoutPtr, 923 RF_StripeNum_t stripeID, 924 RF_StripeNum_t * psID, 925 RF_ReconUnitNum_t * which_ru) 926 { 927 *which_ru = 0; 928 *psID = stripeID; 929 } 930 931 932 /* select an algorithm for performing an access. Returns two pointers, 933 * one to a function that will return information about the DAG, and 934 * another to a function that will create the dag. 935 */ 936 void 937 rf_ParityLoggingDagSelect( 938 RF_Raid_t * raidPtr, 939 RF_IoType_t type, 940 RF_AccessStripeMap_t * asmp, 941 RF_VoidFuncPtr * createFunc) 942 { 943 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 944 RF_PhysDiskAddr_t *failedPDA = NULL; 945 RF_RowCol_t frow, fcol; 946 RF_RowStatus_t rstat; 947 int prior_recon; 948 949 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 950 951 if (asmp->numDataFailed + asmp->numParityFailed > 1) { 952 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); 953 /* *infoFunc = */ *createFunc = NULL; 954 return; 955 } else 956 if (asmp->numDataFailed + asmp->numParityFailed == 1) { 957 958 /* if under recon & already reconstructed, redirect 959 * the access to the spare drive and eliminate the 960 * failure indication */ 961 failedPDA = asmp->failedPDAs[0]; 962 frow = failedPDA->row; 963 fcol = failedPDA->col; 964 rstat = raidPtr->status[failedPDA->row]; 965 prior_recon = (rstat == rf_rs_reconfigured) || ( 966 (rstat == rf_rs_reconstructing) ? 967 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0 968 ); 969 if (prior_recon) { 970 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col; 971 RF_SectorNum_t oo = failedPDA->startSector; 972 if (layoutPtr->map->flags & 973 RF_DISTRIBUTE_SPARE) { 974 /* redirect to dist spare space */ 975 976 if (failedPDA == asmp->parityInfo) { 977 978 /* parity has failed */ 979 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row, 980 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 981 982 if (asmp->parityInfo->next) { /* redir 2nd component, 983 * if any */ 984 RF_PhysDiskAddr_t *p = asmp->parityInfo->next; 985 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; 986 p->row = failedPDA->row; 987 p->col = failedPDA->col; 988 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + 989 SUoffs; /* cheating: 990 * startSector is not 991 * really a RAID address */ 992 } 993 } else 994 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) { 995 RF_ASSERT(0); /* should not ever 996 * happen */ 997 } else { 998 999 /* data has failed */ 1000 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row, 1001 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 1002 1003 } 1004 1005 } else { 1006 /* redirect to dedicated spare space */ 1007 1008 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow; 1009 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol; 1010 1011 /* the parity may have two distinct 1012 * components, both of which may need 1013 * to be redirected */ 1014 if (asmp->parityInfo->next) { 1015 if (failedPDA == asmp->parityInfo) { 1016 failedPDA->next->row = failedPDA->row; 1017 failedPDA->next->col = failedPDA->col; 1018 } else 1019 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */ 1020 asmp->parityInfo->row = failedPDA->row; 1021 asmp->parityInfo->col = failedPDA->col; 1022 } 1023 } 1024 } 1025 1026 RF_ASSERT(failedPDA->col != -1); 1027 1028 if (rf_dagDebug || rf_mapDebug) { 1029 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n", 1030 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector); 1031 } 1032 asmp->numDataFailed = asmp->numParityFailed = 0; 1033 } 1034 } 1035 if (type == RF_IO_TYPE_READ) { 1036 1037 if (asmp->numDataFailed == 0) 1038 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; 1039 else 1040 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG; 1041 1042 } else { 1043 1044 1045 /* if mirroring, always use large writes. If the access 1046 * requires two distinct parity updates, always do a small 1047 * write. If the stripe contains a failure but the access 1048 * does not, do a small write. The first conditional 1049 * (numStripeUnitsAccessed <= numDataCol/2) uses a 1050 * less-than-or-equal rather than just a less-than because 1051 * when G is 3 or 4, numDataCol/2 is 1, and I want 1052 * single-stripe-unit updates to use just one disk. */ 1053 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) { 1054 if (((asmp->numStripeUnitsAccessed <= 1055 (layoutPtr->numDataCol / 2)) && 1056 (layoutPtr->numDataCol != 1)) || 1057 (asmp->parityInfo->next != NULL) || 1058 rf_CheckStripeForFailures(raidPtr, asmp)) { 1059 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG; 1060 } else 1061 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG; 1062 } else 1063 if (asmp->numParityFailed == 1) 1064 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 1065 else 1066 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) 1067 *createFunc = NULL; 1068 else 1069 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 1070 } 1071 } 1072 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 1073