1 /* $NetBSD: rf_paritylogging.c,v 1.11 2001/10/04 15:58:55 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 
27 */ 28 29 30 /* 31 parity logging configuration, dag selection, and mapping is implemented here 32 */ 33 34 #include "rf_archs.h" 35 36 #if RF_INCLUDE_PARITYLOGGING > 0 37 38 #include <dev/raidframe/raidframevar.h> 39 40 #include "rf_raid.h" 41 #include "rf_dag.h" 42 #include "rf_dagutils.h" 43 #include "rf_dagfuncs.h" 44 #include "rf_dagffrd.h" 45 #include "rf_dagffwr.h" 46 #include "rf_dagdegrd.h" 47 #include "rf_dagdegwr.h" 48 #include "rf_paritylog.h" 49 #include "rf_paritylogDiskMgr.h" 50 #include "rf_paritylogging.h" 51 #include "rf_parityloggingdags.h" 52 #include "rf_general.h" 53 #include "rf_map.h" 54 #include "rf_utils.h" 55 #include "rf_shutdown.h" 56 57 typedef struct RF_ParityLoggingConfigInfo_s { 58 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by 59 * IdentifyStripe */ 60 } RF_ParityLoggingConfigInfo_t; 61 62 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID); 63 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); 64 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); 65 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); 66 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); 67 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); 68 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); 69 70 int 71 rf_ConfigureParityLogging( 72 RF_ShutdownList_t ** listp, 73 RF_Raid_t * raidPtr, 74 RF_Config_t * cfgPtr) 75 { 76 int i, j, startdisk, rc; 77 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; 78 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; 79 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 80 RF_ParityLoggingConfigInfo_t *info; 81 RF_ParityLog_t *l = NULL, *next; 82 caddr_t lHeapPtr; 83 84 if (rf_numParityRegions <= 0) 85 return(EINVAL); 86 87 /* 88 * We create multiple entries on the shutdown list here, since 89 * this configuration routine is fairly complicated in and of 90 * itself, and 
this makes backing out of a failed configuration 91 * much simpler. 92 */ 93 94 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; 95 96 /* create a parity logging configuration structure */ 97 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), 98 (RF_ParityLoggingConfigInfo_t *), 99 raidPtr->cleanupList); 100 if (info == NULL) 101 return (ENOMEM); 102 layoutPtr->layoutSpecificInfo = (void *) info; 103 104 RF_ASSERT(raidPtr->numRow == 1); 105 106 /* the stripe identifier must identify the disks in each stripe, IN 107 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 108 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), 109 (raidPtr->numCol), 110 raidPtr->cleanupList); 111 if (info->stripeIdentifier == NULL) 112 return (ENOMEM); 113 114 startdisk = 0; 115 for (i = 0; i < (raidPtr->numCol); i++) { 116 for (j = 0; j < (raidPtr->numCol); j++) { 117 info->stripeIdentifier[i][j] = (startdisk + j) % 118 (raidPtr->numCol - 1); 119 } 120 if ((--startdisk) < 0) 121 startdisk = raidPtr->numCol - 1 - 1; 122 } 123 124 /* fill in the remaining layout parameters */ 125 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 126 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << 127 raidPtr->logBytesPerSector; 128 layoutPtr->numParityCol = 1; 129 layoutPtr->numParityLogCol = 1; 130 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - 131 layoutPtr->numParityLogCol; 132 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * 133 layoutPtr->sectorsPerStripeUnit; 134 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 135 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * 136 layoutPtr->sectorsPerStripeUnit; 137 138 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * 139 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 140 141 /* configure parity log parameters 142 * 143 * parameter comment/constraints 144 * ------------------------------------------- 145 * numParityRegions* all regions (except 
possibly last) 146 * of equal size 147 * totalInCoreLogCapacity* amount of memory in bytes available 148 * for in-core logs (default 1 MB) 149 * numSectorsPerLog# capacity of an in-core log in sectors 150 * (1 * disk track) 151 * numParityLogs total number of in-core logs, 152 * should be at least numParityRegions 153 * regionLogCapacity size of a region log (except possibly 154 * last one) in sectors 155 * totalLogCapacity total amount of log space in sectors 156 * 157 * where '*' denotes a user settable parameter. 158 * Note that logs are fixed to be the size of a disk track, 159 * value #defined in rf_paritylog.h 160 * 161 */ 162 163 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; 164 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; 165 if (rf_parityLogDebug) 166 printf("bytes per sector %d\n", raidPtr->bytesPerSector); 167 168 /* reduce fragmentation within a disk region by adjusting the number 169 * of regions in an attempt to allow an integral number of logs to fit 170 * into a disk region */ 171 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; 172 if (fragmentation > 0) 173 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { 174 if (((totalLogCapacity / (rf_numParityRegions + i)) % 175 raidPtr->numSectorsPerLog) < fragmentation) { 176 rf_numParityRegions++; 177 raidPtr->regionLogCapacity = totalLogCapacity / 178 rf_numParityRegions; 179 fragmentation = raidPtr->regionLogCapacity % 180 raidPtr->numSectorsPerLog; 181 } 182 if (((totalLogCapacity / (rf_numParityRegions - i)) % 183 raidPtr->numSectorsPerLog) < fragmentation) { 184 rf_numParityRegions--; 185 raidPtr->regionLogCapacity = totalLogCapacity / 186 rf_numParityRegions; 187 fragmentation = raidPtr->regionLogCapacity % 188 raidPtr->numSectorsPerLog; 189 } 190 } 191 /* ensure integral number of regions per log */ 192 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / 193 
raidPtr->numSectorsPerLog) * 194 raidPtr->numSectorsPerLog; 195 196 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / 197 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); 198 /* to avoid deadlock, must ensure that enough logs exist for each 199 * region to have one simultaneously */ 200 if (raidPtr->numParityLogs < rf_numParityRegions) 201 raidPtr->numParityLogs = rf_numParityRegions; 202 203 /* create region information structs */ 204 printf("Allocating %d bytes for in-core parity region info\n", 205 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); 206 RF_Malloc(raidPtr->regionInfo, 207 (rf_numParityRegions * sizeof(RF_RegionInfo_t)), 208 (RF_RegionInfo_t *)); 209 if (raidPtr->regionInfo == NULL) 210 return (ENOMEM); 211 212 /* last region may not be full capacity */ 213 lastRegionCapacity = raidPtr->regionLogCapacity; 214 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + 215 lastRegionCapacity > totalLogCapacity) 216 lastRegionCapacity = lastRegionCapacity - 217 raidPtr->numSectorsPerLog; 218 219 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / 220 rf_numParityRegions; 221 maxRegionParityRange = raidPtr->regionParityRange; 222 223 /* i can't remember why this line is in the code -wvcii 6/30/95 */ 224 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) 225 regionParityRange++; */ 226 227 /* build pool of unused parity logs */ 228 printf("Allocating %d bytes for %d parity logs\n", 229 raidPtr->numParityLogs * raidPtr->numSectorsPerLog * 230 raidPtr->bytesPerSector, 231 raidPtr->numParityLogs); 232 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 233 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, 234 (caddr_t)); 235 if (raidPtr->parityLogBufferHeap == NULL) 236 return (ENOMEM); 237 lHeapPtr = raidPtr->parityLogBufferHeap; 238 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex); 239 if (rc) { 240 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 241 __FILE__, __LINE__, rc); 242 
RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 243 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 244 return (ENOMEM); 245 } 246 for (i = 0; i < raidPtr->numParityLogs; i++) { 247 if (i == 0) { 248 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, 249 sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); 250 if (raidPtr->parityLogPool.parityLogs == NULL) { 251 RF_Free(raidPtr->parityLogBufferHeap, 252 raidPtr->numParityLogs * 253 raidPtr->numSectorsPerLog * 254 raidPtr->bytesPerSector); 255 return (ENOMEM); 256 } 257 l = raidPtr->parityLogPool.parityLogs; 258 } else { 259 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), 260 (RF_ParityLog_t *)); 261 if (l->next == NULL) { 262 RF_Free(raidPtr->parityLogBufferHeap, 263 raidPtr->numParityLogs * 264 raidPtr->numSectorsPerLog * 265 raidPtr->bytesPerSector); 266 for (l = raidPtr->parityLogPool.parityLogs; 267 l; 268 l = next) { 269 next = l->next; 270 if (l->records) 271 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); 272 RF_Free(l, sizeof(RF_ParityLog_t)); 273 } 274 return (ENOMEM); 275 } 276 l = l->next; 277 } 278 l->bufPtr = lHeapPtr; 279 lHeapPtr += raidPtr->numSectorsPerLog * 280 raidPtr->bytesPerSector; 281 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * 282 sizeof(RF_ParityLogRecord_t)), 283 (RF_ParityLogRecord_t *)); 284 if (l->records == NULL) { 285 RF_Free(raidPtr->parityLogBufferHeap, 286 raidPtr->numParityLogs * 287 raidPtr->numSectorsPerLog * 288 raidPtr->bytesPerSector); 289 for (l = raidPtr->parityLogPool.parityLogs; 290 l; 291 l = next) { 292 next = l->next; 293 if (l->records) 294 RF_Free(l->records, 295 (raidPtr->numSectorsPerLog * 296 sizeof(RF_ParityLogRecord_t))); 297 RF_Free(l, sizeof(RF_ParityLog_t)); 298 } 299 return (ENOMEM); 300 } 301 } 302 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); 303 if (rc) { 304 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 305 __LINE__, rc); 306 
rf_ShutdownParityLoggingPool(raidPtr); 307 return (rc); 308 } 309 /* build pool of region buffers */ 310 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex); 311 if (rc) { 312 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 313 __FILE__, __LINE__, rc); 314 return (ENOMEM); 315 } 316 rc = rf_cond_init(&raidPtr->regionBufferPool.cond); 317 if (rc) { 318 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", 319 __FILE__, __LINE__, rc); 320 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 321 return (ENOMEM); 322 } 323 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * 324 raidPtr->bytesPerSector; 325 printf("regionBufferPool.bufferSize %d\n", 326 raidPtr->regionBufferPool.bufferSize); 327 328 /* for now, only one region at a time may be reintegrated */ 329 raidPtr->regionBufferPool.totalBuffers = 1; 330 331 raidPtr->regionBufferPool.availableBuffers = 332 raidPtr->regionBufferPool.totalBuffers; 333 raidPtr->regionBufferPool.availBuffersIndex = 0; 334 raidPtr->regionBufferPool.emptyBuffersIndex = 0; 335 printf("Allocating %d bytes for regionBufferPool\n", 336 (int) (raidPtr->regionBufferPool.totalBuffers * 337 sizeof(caddr_t))); 338 RF_Malloc(raidPtr->regionBufferPool.buffers, 339 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), 340 (caddr_t *)); 341 if (raidPtr->regionBufferPool.buffers == NULL) { 342 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 343 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 344 return (ENOMEM); 345 } 346 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { 347 printf("Allocating %d bytes for regionBufferPool#%d\n", 348 (int) (raidPtr->regionBufferPool.bufferSize * 349 sizeof(char)), i); 350 RF_Malloc(raidPtr->regionBufferPool.buffers[i], 351 raidPtr->regionBufferPool.bufferSize * sizeof(char), 352 (caddr_t)); 353 if (raidPtr->regionBufferPool.buffers[i] == NULL) { 354 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 355 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 356 
for (j = 0; j < i; j++) { 357 RF_Free(raidPtr->regionBufferPool.buffers[i], 358 raidPtr->regionBufferPool.bufferSize * 359 sizeof(char)); 360 } 361 RF_Free(raidPtr->regionBufferPool.buffers, 362 raidPtr->regionBufferPool.totalBuffers * 363 sizeof(caddr_t)); 364 return (ENOMEM); 365 } 366 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, 367 (long) raidPtr->regionBufferPool.buffers[i]); 368 } 369 rc = rf_ShutdownCreate(listp, 370 rf_ShutdownParityLoggingRegionBufferPool, 371 raidPtr); 372 if (rc) { 373 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 374 __LINE__, rc); 375 rf_ShutdownParityLoggingRegionBufferPool(raidPtr); 376 return (rc); 377 } 378 /* build pool of parity buffers */ 379 parityBufferCapacity = maxRegionParityRange; 380 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex); 381 if (rc) { 382 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 383 __FILE__, __LINE__, rc); 384 return (rc); 385 } 386 rc = rf_cond_init(&raidPtr->parityBufferPool.cond); 387 if (rc) { 388 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", 389 __FILE__, __LINE__, rc); 390 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 391 return (ENOMEM); 392 } 393 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * 394 raidPtr->bytesPerSector; 395 printf("parityBufferPool.bufferSize %d\n", 396 raidPtr->parityBufferPool.bufferSize); 397 398 /* for now, only one region at a time may be reintegrated */ 399 raidPtr->parityBufferPool.totalBuffers = 1; 400 401 raidPtr->parityBufferPool.availableBuffers = 402 raidPtr->parityBufferPool.totalBuffers; 403 raidPtr->parityBufferPool.availBuffersIndex = 0; 404 raidPtr->parityBufferPool.emptyBuffersIndex = 0; 405 printf("Allocating %d bytes for parityBufferPool of %d units\n", 406 (int) (raidPtr->parityBufferPool.totalBuffers * 407 sizeof(caddr_t)), 408 raidPtr->parityBufferPool.totalBuffers ); 409 RF_Malloc(raidPtr->parityBufferPool.buffers, 410 
raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), 411 (caddr_t *)); 412 if (raidPtr->parityBufferPool.buffers == NULL) { 413 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 414 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 415 return (ENOMEM); 416 } 417 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { 418 printf("Allocating %d bytes for parityBufferPool#%d\n", 419 (int) (raidPtr->parityBufferPool.bufferSize * 420 sizeof(char)),i); 421 RF_Malloc(raidPtr->parityBufferPool.buffers[i], 422 raidPtr->parityBufferPool.bufferSize * sizeof(char), 423 (caddr_t)); 424 if (raidPtr->parityBufferPool.buffers == NULL) { 425 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 426 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 427 for (j = 0; j < i; j++) { 428 RF_Free(raidPtr->parityBufferPool.buffers[i], 429 raidPtr->regionBufferPool.bufferSize * 430 sizeof(char)); 431 } 432 RF_Free(raidPtr->parityBufferPool.buffers, 433 raidPtr->regionBufferPool.totalBuffers * 434 sizeof(caddr_t)); 435 return (ENOMEM); 436 } 437 printf("parityBufferPool.buffers[%d] = %lx\n", i, 438 (long) raidPtr->parityBufferPool.buffers[i]); 439 } 440 rc = rf_ShutdownCreate(listp, 441 rf_ShutdownParityLoggingParityBufferPool, 442 raidPtr); 443 if (rc) { 444 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 445 __LINE__, rc); 446 rf_ShutdownParityLoggingParityBufferPool(raidPtr); 447 return (rc); 448 } 449 /* initialize parityLogDiskQueue */ 450 rc = rf_create_managed_mutex(listp, 451 &raidPtr->parityLogDiskQueue.mutex); 452 if (rc) { 453 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", 454 __FILE__, __LINE__, rc); 455 return (rc); 456 } 457 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond); 458 if (rc) { 459 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", 460 __FILE__, __LINE__, rc); 461 return (rc); 462 } 463 raidPtr->parityLogDiskQueue.flushQueue = NULL; 464 raidPtr->parityLogDiskQueue.reintQueue = 
NULL; 465 raidPtr->parityLogDiskQueue.bufHead = NULL; 466 raidPtr->parityLogDiskQueue.bufTail = NULL; 467 raidPtr->parityLogDiskQueue.reintHead = NULL; 468 raidPtr->parityLogDiskQueue.reintTail = NULL; 469 raidPtr->parityLogDiskQueue.logBlockHead = NULL; 470 raidPtr->parityLogDiskQueue.logBlockTail = NULL; 471 raidPtr->parityLogDiskQueue.reintBlockHead = NULL; 472 raidPtr->parityLogDiskQueue.reintBlockTail = NULL; 473 raidPtr->parityLogDiskQueue.freeDataList = NULL; 474 raidPtr->parityLogDiskQueue.freeCommonList = NULL; 475 476 rc = rf_ShutdownCreate(listp, 477 rf_ShutdownParityLoggingDiskQueue, 478 raidPtr); 479 if (rc) { 480 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 481 __LINE__, rc); 482 return (rc); 483 } 484 for (i = 0; i < rf_numParityRegions; i++) { 485 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex); 486 if (rc) { 487 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, 488 __LINE__, rc); 489 for (j = 0; j < i; j++) 490 FreeRegionInfo(raidPtr, j); 491 RF_Free(raidPtr->regionInfo, 492 (rf_numParityRegions * 493 sizeof(RF_RegionInfo_t))); 494 return (ENOMEM); 495 } 496 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); 497 if (rc) { 498 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, 499 __LINE__, rc); 500 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 501 for (j = 0; j < i; j++) 502 FreeRegionInfo(raidPtr, j); 503 RF_Free(raidPtr->regionInfo, 504 (rf_numParityRegions * 505 sizeof(RF_RegionInfo_t))); 506 return (ENOMEM); 507 } 508 raidPtr->regionInfo[i].reintInProgress = RF_FALSE; 509 raidPtr->regionInfo[i].regionStartAddr = 510 raidPtr->regionLogCapacity * i; 511 raidPtr->regionInfo[i].parityStartAddr = 512 raidPtr->regionParityRange * i; 513 if (i < rf_numParityRegions - 1) { 514 raidPtr->regionInfo[i].capacity = 515 raidPtr->regionLogCapacity; 516 raidPtr->regionInfo[i].numSectorsParity = 517 raidPtr->regionParityRange; 518 } else { 519 raidPtr->regionInfo[i].capacity 
= 520 lastRegionCapacity; 521 raidPtr->regionInfo[i].numSectorsParity = 522 raidPtr->sectorsPerDisk - 523 raidPtr->regionParityRange * i; 524 if (raidPtr->regionInfo[i].numSectorsParity > 525 maxRegionParityRange) 526 maxRegionParityRange = 527 raidPtr->regionInfo[i].numSectorsParity; 528 } 529 raidPtr->regionInfo[i].diskCount = 0; 530 RF_ASSERT(raidPtr->regionInfo[i].capacity + 531 raidPtr->regionInfo[i].regionStartAddr <= 532 totalLogCapacity); 533 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + 534 raidPtr->regionInfo[i].numSectorsParity <= 535 raidPtr->sectorsPerDisk); 536 printf("Allocating %d bytes for region %d\n", 537 (int) (raidPtr->regionInfo[i].capacity * 538 sizeof(RF_DiskMap_t)), i); 539 RF_Malloc(raidPtr->regionInfo[i].diskMap, 540 (raidPtr->regionInfo[i].capacity * 541 sizeof(RF_DiskMap_t)), 542 (RF_DiskMap_t *)); 543 if (raidPtr->regionInfo[i].diskMap == NULL) { 544 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 545 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex); 546 for (j = 0; j < i; j++) 547 FreeRegionInfo(raidPtr, j); 548 RF_Free(raidPtr->regionInfo, 549 (rf_numParityRegions * 550 sizeof(RF_RegionInfo_t))); 551 return (ENOMEM); 552 } 553 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; 554 raidPtr->regionInfo[i].coreLog = NULL; 555 } 556 rc = rf_ShutdownCreate(listp, 557 rf_ShutdownParityLoggingRegionInfo, 558 raidPtr); 559 if (rc) { 560 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 561 __LINE__, rc); 562 rf_ShutdownParityLoggingRegionInfo(raidPtr); 563 return (rc); 564 } 565 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); 566 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; 567 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, 568 rf_ParityLoggingDiskManager, raidPtr,"rf_log"); 569 if (rc) { 570 raidPtr->parityLogDiskQueue.threadState = 0; 571 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", 572 __FILE__, __LINE__, rc); 573 return (ENOMEM); 
574 } 575 /* wait for thread to start */ 576 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 577 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { 578 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 579 raidPtr->parityLogDiskQueue.mutex); 580 } 581 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 582 583 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); 584 if (rc) { 585 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc); 586 rf_ShutdownParityLogging(raidPtr); 587 return (rc); 588 } 589 if (rf_parityLogDebug) { 590 printf(" size of disk log in sectors: %d\n", 591 (int) totalLogCapacity); 592 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions); 593 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity); 594 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation); 595 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); 596 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); 597 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); 598 } 599 rf_EnableParityLogging(raidPtr); 600 601 return (0); 602 } 603 604 static void 605 FreeRegionInfo( 606 RF_Raid_t * raidPtr, 607 RF_RegionId_t regionID) 608 { 609 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 610 RF_Free(raidPtr->regionInfo[regionID].diskMap, 611 (raidPtr->regionInfo[regionID].capacity * 612 sizeof(RF_DiskMap_t))); 613 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { 614 rf_ReleaseParityLogs(raidPtr, 615 raidPtr->regionInfo[regionID].coreLog); 616 raidPtr->regionInfo[regionID].coreLog = NULL; 617 } else { 618 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); 619 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); 620 } 621 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 622 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex); 623 
rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex); 624 } 625 626 627 static void 628 FreeParityLogQueue( 629 RF_Raid_t * raidPtr, 630 RF_ParityLogQueue_t * queue) 631 { 632 RF_ParityLog_t *l1, *l2; 633 634 RF_LOCK_MUTEX(queue->mutex); 635 l1 = queue->parityLogs; 636 while (l1) { 637 l2 = l1; 638 l1 = l2->next; 639 RF_Free(l2->records, (raidPtr->numSectorsPerLog * 640 sizeof(RF_ParityLogRecord_t))); 641 RF_Free(l2, sizeof(RF_ParityLog_t)); 642 } 643 RF_UNLOCK_MUTEX(queue->mutex); 644 rf_mutex_destroy(&queue->mutex); 645 } 646 647 648 static void 649 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) 650 { 651 int i; 652 653 RF_LOCK_MUTEX(queue->mutex); 654 if (queue->availableBuffers != queue->totalBuffers) { 655 printf("Attempt to free region queue which is still in use!\n"); 656 RF_ASSERT(0); 657 } 658 for (i = 0; i < queue->totalBuffers; i++) 659 RF_Free(queue->buffers[i], queue->bufferSize); 660 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t)); 661 RF_UNLOCK_MUTEX(queue->mutex); 662 rf_mutex_destroy(&queue->mutex); 663 } 664 665 static void 666 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) 667 { 668 RF_Raid_t *raidPtr; 669 RF_RegionId_t i; 670 671 raidPtr = (RF_Raid_t *) arg; 672 if (rf_parityLogDebug) { 673 printf("raid%d: ShutdownParityLoggingRegionInfo\n", 674 raidPtr->raidid); 675 } 676 /* free region information structs */ 677 for (i = 0; i < rf_numParityRegions; i++) 678 FreeRegionInfo(raidPtr, i); 679 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * 680 sizeof(raidPtr->regionInfo))); 681 raidPtr->regionInfo = NULL; 682 } 683 684 static void 685 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) 686 { 687 RF_Raid_t *raidPtr; 688 689 raidPtr = (RF_Raid_t *) arg; 690 if (rf_parityLogDebug) { 691 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid); 692 } 693 /* free contents of parityLogPool */ 694 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); 695 RF_Free(raidPtr->parityLogBufferHeap, 
raidPtr->numParityLogs * 696 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 697 } 698 699 static void 700 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) 701 { 702 RF_Raid_t *raidPtr; 703 704 raidPtr = (RF_Raid_t *) arg; 705 if (rf_parityLogDebug) { 706 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n", 707 raidPtr->raidid); 708 } 709 FreeRegionBufferQueue(&raidPtr->regionBufferPool); 710 } 711 712 static void 713 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) 714 { 715 RF_Raid_t *raidPtr; 716 717 raidPtr = (RF_Raid_t *) arg; 718 if (rf_parityLogDebug) { 719 printf("raid%d: ShutdownParityLoggingParityBufferPool\n", 720 raidPtr->raidid); 721 } 722 FreeRegionBufferQueue(&raidPtr->parityBufferPool); 723 } 724 725 static void 726 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) 727 { 728 RF_ParityLogData_t *d; 729 RF_CommonLogData_t *c; 730 RF_Raid_t *raidPtr; 731 732 raidPtr = (RF_Raid_t *) arg; 733 if (rf_parityLogDebug) { 734 printf("raid%d: ShutdownParityLoggingDiskQueue\n", 735 raidPtr->raidid); 736 } 737 /* free disk manager stuff */ 738 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); 739 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); 740 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); 741 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); 742 while (raidPtr->parityLogDiskQueue.freeDataList) { 743 d = raidPtr->parityLogDiskQueue.freeDataList; 744 raidPtr->parityLogDiskQueue.freeDataList = 745 raidPtr->parityLogDiskQueue.freeDataList->next; 746 RF_Free(d, sizeof(RF_ParityLogData_t)); 747 } 748 while (raidPtr->parityLogDiskQueue.freeCommonList) { 749 c = raidPtr->parityLogDiskQueue.freeCommonList; 750 rf_mutex_destroy(&c->mutex); 751 raidPtr->parityLogDiskQueue.freeCommonList = 752 raidPtr->parityLogDiskQueue.freeCommonList->next; 753 RF_Free(c, sizeof(RF_CommonLogData_t)); 754 } 755 } 756 757 static void 758 rf_ShutdownParityLogging(RF_ThreadArg_t arg) 759 { 760 RF_Raid_t *raidPtr; 761 
762 raidPtr = (RF_Raid_t *) arg; 763 if (rf_parityLogDebug) { 764 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid); 765 } 766 /* shutdown disk thread */ 767 /* This has the desirable side-effect of forcing all regions to be 768 * reintegrated. This is necessary since all parity log maps are 769 * currently held in volatile memory. */ 770 771 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 772 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; 773 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 774 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); 775 /* 776 * pLogDiskThread will now terminate when queues are cleared 777 * now wait for it to be done 778 */ 779 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 780 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) { 781 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 782 raidPtr->parityLogDiskQueue.mutex); 783 } 784 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 785 if (rf_parityLogDebug) { 786 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid); 787 } 788 } 789 790 int 791 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) 792 { 793 return (20); 794 } 795 796 RF_HeadSepLimit_t 797 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) 798 { 799 return (10); 800 } 801 /* return the region ID for a given RAID address */ 802 RF_RegionId_t 803 rf_MapRegionIDParityLogging( 804 RF_Raid_t * raidPtr, 805 RF_SectorNum_t address) 806 { 807 RF_RegionId_t regionID; 808 809 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ 810 regionID = address / raidPtr->regionParityRange; 811 if (regionID == rf_numParityRegions) { 812 /* last region may be larger than other regions */ 813 regionID--; 814 } 815 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); 816 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + 817 raidPtr->regionInfo[regionID].numSectorsParity); 
818 RF_ASSERT(regionID < rf_numParityRegions); 819 return (regionID); 820 } 821 822 823 /* given a logical RAID sector, determine physical disk address of data */ 824 void 825 rf_MapSectorParityLogging( 826 RF_Raid_t * raidPtr, 827 RF_RaidAddr_t raidSector, 828 RF_RowCol_t * row, 829 RF_RowCol_t * col, 830 RF_SectorNum_t * diskSector, 831 int remap) 832 { 833 RF_StripeNum_t SUID = raidSector / 834 raidPtr->Layout.sectorsPerStripeUnit; 835 *row = 0; 836 /* *col = (SUID % (raidPtr->numCol - 837 * raidPtr->Layout.numParityLogCol)); */ 838 *col = SUID % raidPtr->Layout.numDataCol; 839 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 840 raidPtr->Layout.sectorsPerStripeUnit + 841 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 842 } 843 844 845 /* given a logical RAID sector, determine physical disk address of parity */ 846 void 847 rf_MapParityParityLogging( 848 RF_Raid_t * raidPtr, 849 RF_RaidAddr_t raidSector, 850 RF_RowCol_t * row, 851 RF_RowCol_t * col, 852 RF_SectorNum_t * diskSector, 853 int remap) 854 { 855 RF_StripeNum_t SUID = raidSector / 856 raidPtr->Layout.sectorsPerStripeUnit; 857 858 *row = 0; 859 /* *col = 860 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt 861 * r->numCol - raidPtr->Layout.numParityLogCol); */ 862 *col = raidPtr->Layout.numDataCol; 863 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 864 raidPtr->Layout.sectorsPerStripeUnit + 865 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 866 } 867 868 869 /* given a regionID and sector offset, determine the physical disk address of the parity log */ 870 void 871 rf_MapLogParityLogging( 872 RF_Raid_t * raidPtr, 873 RF_RegionId_t regionID, 874 RF_SectorNum_t regionOffset, 875 RF_RowCol_t * row, 876 RF_RowCol_t * col, 877 RF_SectorNum_t * startSector) 878 { 879 *row = 0; 880 *col = raidPtr->numCol - 1; 881 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; 882 } 883 884 885 /* given a regionID, determine the physical disk address of 
the logged
   parity for that region */
/*
 * Outputs: *row is always 0, *col is the second-to-last column
 * (numCol - 2), *startSector and *numSector are the region's
 * parityStartAddr and numSectorsParity as recorded in regionInfo.
 */
void
rf_MapRegionParity(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_RowCol_t * row,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector,
    RF_SectorCount_t * numSector)
{
	*row = 0;
	*col = raidPtr->numCol - 2;
	*startSector = raidPtr->regionInfo[regionID].parityStartAddr;
	*numSector = raidPtr->regionInfo[regionID].numSectorsParity;
}


/* given a logical RAID address, determine the participating disks in
   the stripe.

   *diskids is set to point at a row of the stripeIdentifier table built
   at configuration time (selected by stripeID modulo numCol); the table
   is not copied.  *outRow is always 0. */
void
rf_IdentifyStripeParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids,
    RF_RowCol_t * outRow)
{
	RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
							   addr);
	RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
		raidPtr->Layout.layoutSpecificInfo;
	*outRow = 0;
	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
}


/* map a stripe ID to a parity stripe ID: in this layout the two are
 * identical, and the reconstruction unit is always 0.  layoutPtr is not
 * consulted. */
void
rf_MapSIDToPSIDParityLogging(
    RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID,
    RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru)
{
	*which_ru = 0;
	*psID = stripeID;
}


/* select an algorithm for performing an access. Returns two pointers,
 * one to a function that will return information about the DAG, and
 * another to a function that will create the dag.
 */
void
rf_ParityLoggingDagSelect(
    RF_Raid_t * raidPtr,
    RF_IoType_t type,
    RF_AccessStripeMap_t * asmp,
    RF_VoidFuncPtr * createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;
	RF_RowCol_t frow, fcol;
	RF_RowStatus_t rstat;
	int     prior_recon;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	if (asmp->numDataFailed + asmp->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
		 /* *infoFunc = */ *createFunc = NULL;
		return;
	} else
		if (asmp->numDataFailed + asmp->numParityFailed == 1) {

			/* if under recon & already reconstructed, redirect
			 * the access to the spare drive and eliminate the
			 * failure indication */
			failedPDA = asmp->failedPDAs[0];
			frow = failedPDA->row;
			fcol = failedPDA->col;
			rstat = raidPtr->status[failedPDA->row];
			prior_recon = (rstat == rf_rs_reconfigured) || (
			    (rstat == rf_rs_reconstructing) ?
			    rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
			    );
			if (prior_recon) {
				RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
				RF_SectorNum_t oo = failedPDA->startSector;
				if (layoutPtr->map->flags &
				    RF_DISTRIBUTE_SPARE) {
					/* redirect to dist spare space */

					if (failedPDA == asmp->parityInfo) {

						/* parity has failed */
						(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						if (asmp->parityInfo->next) {	/* redir 2nd component,
										 * if any */
							RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
							RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
							p->row = failedPDA->row;
							p->col = failedPDA->col;
							p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
							    SUoffs;	/* cheating:
									 * startSector is not
									 * really a RAID address */
						}
					} else
						if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
							RF_ASSERT(0);	/* should not ever
									 * happen */
						} else {

							/* data has failed */
							(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
							    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						}

				} else {
					/* redirect to dedicated spare space */

					failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
					failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;

					/* the parity may have two distinct
					 * components, both of which may need
					 * to be redirected */
					if (asmp->parityInfo->next) {
						if (failedPDA == asmp->parityInfo) {
							failedPDA->next->row = failedPDA->row;
							failedPDA->next->col = failedPDA->col;
						} else
							if (failedPDA == asmp->parityInfo->next) {	/* paranoid: should never occur */
								asmp->parityInfo->row = failedPDA->row;
								asmp->parityInfo->col = failedPDA->col;
							}
					}
				}

				RF_ASSERT(failedPDA->col != -1);

				if (rf_dagDebug || rf_mapDebug) {
					printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
					    raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector);
				}
				asmp->numDataFailed = asmp->numParityFailed = 0;
			}
		}
	if (type == RF_IO_TYPE_READ) {

		if (asmp->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;

	} else {


		/* if mirroring, always use large writes. If the access
		 * requires two distinct parity updates, always do a small
		 * write. If the stripe contains a failure but the access
		 * does not, do a small write. The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk.
*/ 1054 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) { 1055 if (((asmp->numStripeUnitsAccessed <= 1056 (layoutPtr->numDataCol / 2)) && 1057 (layoutPtr->numDataCol != 1)) || 1058 (asmp->parityInfo->next != NULL) || 1059 rf_CheckStripeForFailures(raidPtr, asmp)) { 1060 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG; 1061 } else 1062 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG; 1063 } else 1064 if (asmp->numParityFailed == 1) 1065 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 1066 else 1067 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) 1068 *createFunc = NULL; 1069 else 1070 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 1071 } 1072 } 1073 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 1074