1 /* $NetBSD: rf_paritylogging.c,v 1.28 2007/03/04 06:02:39 christos Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 30 /* 31 parity logging configuration, dag selection, and mapping is implemented here 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.28 2007/03/04 06:02:39 christos Exp $"); 36 37 #include "rf_archs.h" 38 39 #if RF_INCLUDE_PARITYLOGGING > 0 40 41 #include <dev/raidframe/raidframevar.h> 42 43 #include "rf_raid.h" 44 #include "rf_dag.h" 45 #include "rf_dagutils.h" 46 #include "rf_dagfuncs.h" 47 #include "rf_dagffrd.h" 48 #include "rf_dagffwr.h" 49 #include "rf_dagdegrd.h" 50 #include "rf_dagdegwr.h" 51 #include "rf_paritylog.h" 52 #include "rf_paritylogDiskMgr.h" 53 #include "rf_paritylogging.h" 54 #include "rf_parityloggingdags.h" 55 #include "rf_general.h" 56 #include "rf_map.h" 57 #include "rf_utils.h" 58 #include "rf_shutdown.h" 59 60 typedef struct RF_ParityLoggingConfigInfo_s { 61 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by 62 * IdentifyStripe */ 63 } RF_ParityLoggingConfigInfo_t; 64 65 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID); 66 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); 67 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); 68 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); 69 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); 70 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); 71 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); 72 73 int 74 rf_ConfigureParityLogging( 75 RF_ShutdownList_t ** listp, 76 RF_Raid_t * raidPtr, 77 RF_Config_t * cfgPtr) 78 { 79 int i, j, startdisk, rc; 80 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; 81 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; 82 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 83 RF_ParityLoggingConfigInfo_t *info; 84 RF_ParityLog_t *l = NULL, *next; 85 void *lHeapPtr; 86 87 if (rf_numParityRegions <= 0) 88 return(EINVAL); 89 90 /* 91 * We create multiple entries on the shutdown list here, since 92 * this configuration routine is fairly complicated in and of 93 * itself, and this makes backing out of a failed configuration 94 * much simpler. 95 */ 96 97 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; 98 99 /* create a parity logging configuration structure */ 100 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), 101 (RF_ParityLoggingConfigInfo_t *), 102 raidPtr->cleanupList); 103 if (info == NULL) 104 return (ENOMEM); 105 layoutPtr->layoutSpecificInfo = (void *) info; 106 107 /* the stripe identifier must identify the disks in each stripe, IN 108 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 109 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), 110 (raidPtr->numCol), 111 raidPtr->cleanupList); 112 if (info->stripeIdentifier == NULL) 113 return (ENOMEM); 114 115 startdisk = 0; 116 for (i = 0; i < (raidPtr->numCol); i++) { 117 for (j = 0; j < (raidPtr->numCol); j++) { 118 info->stripeIdentifier[i][j] = (startdisk + j) % 119 (raidPtr->numCol - 1); 120 } 121 if ((--startdisk) < 0) 122 startdisk = raidPtr->numCol - 1 - 1; 123 } 124 125 /* fill in the remaining layout parameters */ 126 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 127 layoutPtr->numParityCol = 1; 128 layoutPtr->numParityLogCol = 1; 129 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - 130 layoutPtr->numParityLogCol; 131 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * 132 layoutPtr->sectorsPerStripeUnit; 133 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 134 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * 135 layoutPtr->sectorsPerStripeUnit; 136 137 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * 138 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 139 140 /* configure parity log parameters 141 * 142 * parameter comment/constraints 143 * ------------------------------------------- 144 * numParityRegions* all regions (except possibly last) 145 * of equal size 146 * totalInCoreLogCapacity* amount of memory in bytes available 147 * for in-core logs (default 1 MB) 148 * numSectorsPerLog# capacity of an in-core log in sectors 149 * (1 * disk track) 150 * numParityLogs total number of in-core logs, 151 * should be at least numParityRegions 152 * regionLogCapacity size of a region log (except possibly 153 * last one) in sectors 154 * totalLogCapacity total amount of log space in sectors 155 * 156 * where '*' denotes a user settable parameter. 157 * Note that logs are fixed to be the size of a disk track, 158 * value #defined in rf_paritylog.h 159 * 160 */ 161 162 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; 163 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; 164 if (rf_parityLogDebug) 165 printf("bytes per sector %d\n", raidPtr->bytesPerSector); 166 167 /* reduce fragmentation within a disk region by adjusting the number 168 * of regions in an attempt to allow an integral number of logs to fit 169 * into a disk region */ 170 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; 171 if (fragmentation > 0) 172 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { 173 if (((totalLogCapacity / (rf_numParityRegions + i)) % 174 raidPtr->numSectorsPerLog) < fragmentation) { 175 rf_numParityRegions++; 176 raidPtr->regionLogCapacity = totalLogCapacity / 177 rf_numParityRegions; 178 fragmentation = raidPtr->regionLogCapacity % 179 raidPtr->numSectorsPerLog; 180 } 181 if (((totalLogCapacity / (rf_numParityRegions - i)) % 182 raidPtr->numSectorsPerLog) < fragmentation) { 183 rf_numParityRegions--; 184 raidPtr->regionLogCapacity = totalLogCapacity / 185 rf_numParityRegions; 186 fragmentation = raidPtr->regionLogCapacity % 187 raidPtr->numSectorsPerLog; 188 } 189 } 190 /* ensure integral number of regions per log */ 191 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / 192 raidPtr->numSectorsPerLog) * 193 raidPtr->numSectorsPerLog; 194 195 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / 196 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); 197 /* to avoid deadlock, must ensure that enough logs exist for each 198 * region to have one simultaneously */ 199 if (raidPtr->numParityLogs < rf_numParityRegions) 200 raidPtr->numParityLogs = rf_numParityRegions; 201 202 /* create region information structs */ 203 printf("Allocating %d bytes for in-core parity region info\n", 204 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); 205 RF_Malloc(raidPtr->regionInfo, 206 (rf_numParityRegions * sizeof(RF_RegionInfo_t)), 207 (RF_RegionInfo_t *)); 208 if (raidPtr->regionInfo == NULL) 209 return (ENOMEM); 210 211 /* last region may not be full capacity */ 212 lastRegionCapacity = raidPtr->regionLogCapacity; 213 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + 214 lastRegionCapacity > totalLogCapacity) 215 lastRegionCapacity = lastRegionCapacity - 216 raidPtr->numSectorsPerLog; 217 218 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / 219 rf_numParityRegions; 220 maxRegionParityRange = raidPtr->regionParityRange; 221 222 /* i can't remember why this line is in the code -wvcii 6/30/95 */ 223 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) 224 regionParityRange++; */ 225 226 /* build pool of unused parity logs */ 227 printf("Allocating %d bytes for %d parity logs\n", 228 raidPtr->numParityLogs * raidPtr->numSectorsPerLog * 229 raidPtr->bytesPerSector, 230 raidPtr->numParityLogs); 231 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 232 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, 233 (void *)); 234 if (raidPtr->parityLogBufferHeap == NULL) 235 return (ENOMEM); 236 lHeapPtr = raidPtr->parityLogBufferHeap; 237 rf_mutex_init(&raidPtr->parityLogPool.mutex); 238 for (i = 0; i < raidPtr->numParityLogs; i++) { 239 if (i == 0) { 240 RF_Malloc(raidPtr->parityLogPool.parityLogs, 241 sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); 242 if (raidPtr->parityLogPool.parityLogs == NULL) { 243 RF_Free(raidPtr->parityLogBufferHeap, 244 raidPtr->numParityLogs * 245 raidPtr->numSectorsPerLog * 246 raidPtr->bytesPerSector); 247 return (ENOMEM); 248 } 249 l = raidPtr->parityLogPool.parityLogs; 250 } else { 251 RF_Malloc(l->next, sizeof(RF_ParityLog_t), 252 (RF_ParityLog_t *)); 253 if (l->next == NULL) { 254 RF_Free(raidPtr->parityLogBufferHeap, 255 raidPtr->numParityLogs * 256 raidPtr->numSectorsPerLog * 257 raidPtr->bytesPerSector); 258 for (l = raidPtr->parityLogPool.parityLogs; 259 l; 260 l = next) { 261 next = l->next; 262 if (l->records) 263 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); 264 RF_Free(l, sizeof(RF_ParityLog_t)); 265 } 266 return (ENOMEM); 267 } 268 l = l->next; 269 } 270 l->bufPtr = lHeapPtr; 271 lHeapPtr = (char *)lHeapPtr + raidPtr->numSectorsPerLog * 272 raidPtr->bytesPerSector; 273 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * 274 sizeof(RF_ParityLogRecord_t)), 275 (RF_ParityLogRecord_t *)); 276 if (l->records == NULL) { 277 RF_Free(raidPtr->parityLogBufferHeap, 278 raidPtr->numParityLogs * 279 raidPtr->numSectorsPerLog * 280 raidPtr->bytesPerSector); 281 for (l = raidPtr->parityLogPool.parityLogs; 282 l; 283 l = next) { 284 next = l->next; 285 if (l->records) 286 RF_Free(l->records, 287 (raidPtr->numSectorsPerLog * 288 sizeof(RF_ParityLogRecord_t))); 289 RF_Free(l, sizeof(RF_ParityLog_t)); 290 } 291 return (ENOMEM); 292 } 293 } 294 rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); 295 /* build pool of region buffers */ 296 rf_mutex_init(&raidPtr->regionBufferPool.mutex); 297 raidPtr->regionBufferPool.cond = 0; 298 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * 299 raidPtr->bytesPerSector; 300 printf("regionBufferPool.bufferSize %d\n", 301 raidPtr->regionBufferPool.bufferSize); 302 303 /* for now, only one region at a time may be reintegrated */ 304 raidPtr->regionBufferPool.totalBuffers = 1; 305 306 raidPtr->regionBufferPool.availableBuffers = 307 raidPtr->regionBufferPool.totalBuffers; 308 raidPtr->regionBufferPool.availBuffersIndex = 0; 309 raidPtr->regionBufferPool.emptyBuffersIndex = 0; 310 printf("Allocating %d bytes for regionBufferPool\n", 311 (int) (raidPtr->regionBufferPool.totalBuffers * 312 sizeof(void *))); 313 RF_Malloc(raidPtr->regionBufferPool.buffers, 314 raidPtr->regionBufferPool.totalBuffers * sizeof(void *), 315 (void **)); 316 if (raidPtr->regionBufferPool.buffers == NULL) { 317 return (ENOMEM); 318 } 319 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { 320 printf("Allocating %d bytes for regionBufferPool#%d\n", 321 (int) (raidPtr->regionBufferPool.bufferSize * 322 sizeof(char)), i); 323 RF_Malloc(raidPtr->regionBufferPool.buffers[i], 324 raidPtr->regionBufferPool.bufferSize * sizeof(char), 325 (void *)); 326 if (raidPtr->regionBufferPool.buffers[i] == NULL) { 327 for (j = 0; j < i; j++) { 328 RF_Free(raidPtr->regionBufferPool.buffers[i], 329 raidPtr->regionBufferPool.bufferSize * 330 sizeof(char)); 331 } 332 RF_Free(raidPtr->regionBufferPool.buffers, 333 raidPtr->regionBufferPool.totalBuffers * 334 sizeof(void *)); 335 return (ENOMEM); 336 } 337 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, 338 (long) raidPtr->regionBufferPool.buffers[i]); 339 } 340 rf_ShutdownCreate(listp, 341 rf_ShutdownParityLoggingRegionBufferPool, 342 raidPtr); 343 /* build pool of parity buffers */ 344 parityBufferCapacity = maxRegionParityRange; 345 rf_mutex_init(&raidPtr->parityBufferPool.mutex); 346 raidPtr->parityBufferPool.cond = 0; 347 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * 348 raidPtr->bytesPerSector; 349 printf("parityBufferPool.bufferSize %d\n", 350 raidPtr->parityBufferPool.bufferSize); 351 352 /* for now, only one region at a time may be reintegrated */ 353 raidPtr->parityBufferPool.totalBuffers = 1; 354 355 raidPtr->parityBufferPool.availableBuffers = 356 raidPtr->parityBufferPool.totalBuffers; 357 raidPtr->parityBufferPool.availBuffersIndex = 0; 358 raidPtr->parityBufferPool.emptyBuffersIndex = 0; 359 printf("Allocating %d bytes for parityBufferPool of %d units\n", 360 (int) (raidPtr->parityBufferPool.totalBuffers * 361 sizeof(void *)), 362 raidPtr->parityBufferPool.totalBuffers ); 363 RF_Malloc(raidPtr->parityBufferPool.buffers, 364 raidPtr->parityBufferPool.totalBuffers * sizeof(void *), 365 (void **)); 366 if (raidPtr->parityBufferPool.buffers == NULL) { 367 return (ENOMEM); 368 } 369 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { 370 printf("Allocating %d bytes for parityBufferPool#%d\n", 371 (int) (raidPtr->parityBufferPool.bufferSize * 372 sizeof(char)),i); 373 RF_Malloc(raidPtr->parityBufferPool.buffers[i], 374 raidPtr->parityBufferPool.bufferSize * sizeof(char), 375 (void *)); 376 if (raidPtr->parityBufferPool.buffers == NULL) { 377 for (j = 0; j < i; j++) { 378 RF_Free(raidPtr->parityBufferPool.buffers[i], 379 raidPtr->regionBufferPool.bufferSize * 380 sizeof(char)); 381 } 382 RF_Free(raidPtr->parityBufferPool.buffers, 383 raidPtr->regionBufferPool.totalBuffers * 384 sizeof(void *)); 385 return (ENOMEM); 386 } 387 printf("parityBufferPool.buffers[%d] = %lx\n", i, 388 (long) raidPtr->parityBufferPool.buffers[i]); 389 } 390 rf_ShutdownCreate(listp, 391 rf_ShutdownParityLoggingParityBufferPool, 392 raidPtr); 393 /* initialize parityLogDiskQueue */ 394 rf_mutex_init(&raidPtr->parityLogDiskQueue.mutex); 395 raidPtr->parityLogDiskQueue.cond = 0; 396 raidPtr->parityLogDiskQueue.flushQueue = NULL; 397 raidPtr->parityLogDiskQueue.reintQueue = NULL; 398 raidPtr->parityLogDiskQueue.bufHead = NULL; 399 raidPtr->parityLogDiskQueue.bufTail = NULL; 400 raidPtr->parityLogDiskQueue.reintHead = NULL; 401 raidPtr->parityLogDiskQueue.reintTail = NULL; 402 raidPtr->parityLogDiskQueue.logBlockHead = NULL; 403 raidPtr->parityLogDiskQueue.logBlockTail = NULL; 404 raidPtr->parityLogDiskQueue.reintBlockHead = NULL; 405 raidPtr->parityLogDiskQueue.reintBlockTail = NULL; 406 raidPtr->parityLogDiskQueue.freeDataList = NULL; 407 raidPtr->parityLogDiskQueue.freeCommonList = NULL; 408 409 rf_ShutdownCreate(listp, 410 rf_ShutdownParityLoggingDiskQueue, 411 raidPtr); 412 for (i = 0; i < rf_numParityRegions; i++) { 413 rf_mutex_init(&raidPtr->regionInfo[i].mutex); 414 rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); 415 raidPtr->regionInfo[i].reintInProgress = RF_FALSE; 416 raidPtr->regionInfo[i].regionStartAddr = 417 raidPtr->regionLogCapacity * i; 418 raidPtr->regionInfo[i].parityStartAddr = 419 raidPtr->regionParityRange * i; 420 if (i < rf_numParityRegions - 1) { 421 raidPtr->regionInfo[i].capacity = 422 raidPtr->regionLogCapacity; 423 raidPtr->regionInfo[i].numSectorsParity = 424 raidPtr->regionParityRange; 425 } else { 426 raidPtr->regionInfo[i].capacity = 427 lastRegionCapacity; 428 raidPtr->regionInfo[i].numSectorsParity = 429 raidPtr->sectorsPerDisk - 430 raidPtr->regionParityRange * i; 431 if (raidPtr->regionInfo[i].numSectorsParity > 432 maxRegionParityRange) 433 maxRegionParityRange = 434 raidPtr->regionInfo[i].numSectorsParity; 435 } 436 raidPtr->regionInfo[i].diskCount = 0; 437 RF_ASSERT(raidPtr->regionInfo[i].capacity + 438 raidPtr->regionInfo[i].regionStartAddr <= 439 totalLogCapacity); 440 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + 441 raidPtr->regionInfo[i].numSectorsParity <= 442 raidPtr->sectorsPerDisk); 443 printf("Allocating %d bytes for region %d\n", 444 (int) (raidPtr->regionInfo[i].capacity * 445 sizeof(RF_DiskMap_t)), i); 446 RF_Malloc(raidPtr->regionInfo[i].diskMap, 447 (raidPtr->regionInfo[i].capacity * 448 sizeof(RF_DiskMap_t)), 449 (RF_DiskMap_t *)); 450 if (raidPtr->regionInfo[i].diskMap == NULL) { 451 for (j = 0; j < i; j++) 452 FreeRegionInfo(raidPtr, j); 453 RF_Free(raidPtr->regionInfo, 454 (rf_numParityRegions * 455 sizeof(RF_RegionInfo_t))); 456 return (ENOMEM); 457 } 458 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; 459 raidPtr->regionInfo[i].coreLog = NULL; 460 } 461 rf_ShutdownCreate(listp, 462 rf_ShutdownParityLoggingRegionInfo, 463 raidPtr); 464 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); 465 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; 466 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, 467 rf_ParityLoggingDiskManager, raidPtr,"rf_log"); 468 if (rc) { 469 raidPtr->parityLogDiskQueue.threadState = 0; 470 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", 471 __FILE__, __LINE__, rc); 472 return (ENOMEM); 473 } 474 /* wait for thread to start */ 475 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 476 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { 477 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 478 raidPtr->parityLogDiskQueue.mutex); 479 } 480 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 481 482 rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); 483 if (rf_parityLogDebug) { 484 printf(" size of disk log in sectors: %d\n", 485 (int) totalLogCapacity); 486 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions); 487 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity); 488 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation); 489 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); 490 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); 491 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); 492 } 493 rf_EnableParityLogging(raidPtr); 494 495 return (0); 496 } 497 498 static void 499 FreeRegionInfo( 500 RF_Raid_t * raidPtr, 501 RF_RegionId_t regionID) 502 { 503 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 504 RF_Free(raidPtr->regionInfo[regionID].diskMap, 505 (raidPtr->regionInfo[regionID].capacity * 506 sizeof(RF_DiskMap_t))); 507 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { 508 rf_ReleaseParityLogs(raidPtr, 509 raidPtr->regionInfo[regionID].coreLog); 510 raidPtr->regionInfo[regionID].coreLog = NULL; 511 } else { 512 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); 513 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); 514 } 515 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 516 } 517 518 519 static void 520 FreeParityLogQueue( 521 RF_Raid_t * raidPtr, 522 RF_ParityLogQueue_t * queue) 523 { 524 RF_ParityLog_t *l1, *l2; 525 526 RF_LOCK_MUTEX(queue->mutex); 527 l1 = queue->parityLogs; 528 while (l1) { 529 l2 = l1; 530 l1 = l2->next; 531 RF_Free(l2->records, (raidPtr->numSectorsPerLog * 532 sizeof(RF_ParityLogRecord_t))); 533 RF_Free(l2, sizeof(RF_ParityLog_t)); 534 } 535 RF_UNLOCK_MUTEX(queue->mutex); 536 } 537 538 539 static void 540 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) 541 { 542 int i; 543 544 RF_LOCK_MUTEX(queue->mutex); 545 if (queue->availableBuffers != queue->totalBuffers) { 546 printf("Attempt to free region queue which is still in use!\n"); 547 RF_ASSERT(0); 548 } 549 for (i = 0; i < queue->totalBuffers; i++) 550 RF_Free(queue->buffers[i], queue->bufferSize); 551 RF_Free(queue->buffers, queue->totalBuffers * sizeof(void *)); 552 RF_UNLOCK_MUTEX(queue->mutex); 553 } 554 555 static void 556 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) 557 { 558 RF_Raid_t *raidPtr; 559 RF_RegionId_t i; 560 561 raidPtr = (RF_Raid_t *) arg; 562 if (rf_parityLogDebug) { 563 printf("raid%d: ShutdownParityLoggingRegionInfo\n", 564 raidPtr->raidid); 565 } 566 /* free region information structs */ 567 for (i = 0; i < rf_numParityRegions; i++) 568 FreeRegionInfo(raidPtr, i); 569 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * 570 sizeof(raidPtr->regionInfo))); 571 raidPtr->regionInfo = NULL; 572 } 573 574 static void 575 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) 576 { 577 RF_Raid_t *raidPtr; 578 579 raidPtr = (RF_Raid_t *) arg; 580 if (rf_parityLogDebug) { 581 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid); 582 } 583 /* free contents of parityLogPool */ 584 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); 585 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 586 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 587 } 588 589 static void 590 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) 591 { 592 RF_Raid_t *raidPtr; 593 594 raidPtr = (RF_Raid_t *) arg; 595 if (rf_parityLogDebug) { 596 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n", 597 raidPtr->raidid); 598 } 599 FreeRegionBufferQueue(&raidPtr->regionBufferPool); 600 } 601 602 static void 603 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) 604 { 605 RF_Raid_t *raidPtr; 606 607 raidPtr = (RF_Raid_t *) arg; 608 if (rf_parityLogDebug) { 609 printf("raid%d: ShutdownParityLoggingParityBufferPool\n", 610 raidPtr->raidid); 611 } 612 FreeRegionBufferQueue(&raidPtr->parityBufferPool); 613 } 614 615 static void 616 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) 617 { 618 RF_ParityLogData_t *d; 619 RF_CommonLogData_t *c; 620 RF_Raid_t *raidPtr; 621 622 raidPtr = (RF_Raid_t *) arg; 623 if (rf_parityLogDebug) { 624 printf("raid%d: ShutdownParityLoggingDiskQueue\n", 625 raidPtr->raidid); 626 } 627 /* free disk manager stuff */ 628 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); 629 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); 630 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); 631 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); 632 while (raidPtr->parityLogDiskQueue.freeDataList) { 633 d = raidPtr->parityLogDiskQueue.freeDataList; 634 raidPtr->parityLogDiskQueue.freeDataList = 635 raidPtr->parityLogDiskQueue.freeDataList->next; 636 RF_Free(d, sizeof(RF_ParityLogData_t)); 637 } 638 while (raidPtr->parityLogDiskQueue.freeCommonList) { 639 c = raidPtr->parityLogDiskQueue.freeCommonList; 640 raidPtr->parityLogDiskQueue.freeCommonList = 641 raidPtr->parityLogDiskQueue.freeCommonList->next; 642 RF_Free(c, sizeof(RF_CommonLogData_t)); 643 } 644 } 645 646 static void 647 rf_ShutdownParityLogging(RF_ThreadArg_t arg) 648 { 649 RF_Raid_t *raidPtr; 650 651 raidPtr = (RF_Raid_t *) arg; 652 if (rf_parityLogDebug) { 653 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid); 654 } 655 /* shutdown disk thread */ 656 /* This has the desirable side-effect of forcing all regions to be 657 * reintegrated. This is necessary since all parity log maps are 658 * currently held in volatile memory. */ 659 660 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 661 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; 662 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 663 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); 664 /* 665 * pLogDiskThread will now terminate when queues are cleared 666 * now wait for it to be done 667 */ 668 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 669 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) { 670 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 671 raidPtr->parityLogDiskQueue.mutex); 672 } 673 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 674 if (rf_parityLogDebug) { 675 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid); 676 } 677 } 678 679 int 680 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) 681 { 682 return (20); 683 } 684 685 RF_HeadSepLimit_t 686 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) 687 { 688 return (10); 689 } 690 /* return the region ID for a given RAID address */ 691 RF_RegionId_t 692 rf_MapRegionIDParityLogging( 693 RF_Raid_t * raidPtr, 694 RF_SectorNum_t address) 695 { 696 RF_RegionId_t regionID; 697 698 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ 699 regionID = address / raidPtr->regionParityRange; 700 if (regionID == rf_numParityRegions) { 701 /* last region may be larger than other regions */ 702 regionID--; 703 } 704 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); 705 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + 706 raidPtr->regionInfo[regionID].numSectorsParity); 707 RF_ASSERT(regionID < rf_numParityRegions); 708 return (regionID); 709 } 710 711 712 /* given a logical RAID sector, determine physical disk address of data */ 713 void 714 rf_MapSectorParityLogging( 715 RF_Raid_t * raidPtr, 716 RF_RaidAddr_t raidSector, 717 RF_RowCol_t * col, 718 RF_SectorNum_t * diskSector, 719 int remap) 720 { 721 RF_StripeNum_t SUID = raidSector / 722 raidPtr->Layout.sectorsPerStripeUnit; 723 /* *col = (SUID % (raidPtr->numCol - 724 * raidPtr->Layout.numParityLogCol)); */ 725 *col = SUID % raidPtr->Layout.numDataCol; 726 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 727 raidPtr->Layout.sectorsPerStripeUnit + 728 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 729 } 730 731 732 /* given a logical RAID sector, determine physical disk address of parity */ 733 void 734 rf_MapParityParityLogging( 735 RF_Raid_t * raidPtr, 736 RF_RaidAddr_t raidSector, 737 RF_RowCol_t * col, 738 RF_SectorNum_t * diskSector, 739 int remap) 740 { 741 RF_StripeNum_t SUID = raidSector / 742 raidPtr->Layout.sectorsPerStripeUnit; 743 744 /* *col = 745 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt 746 * r->numCol - raidPtr->Layout.numParityLogCol); */ 747 *col = raidPtr->Layout.numDataCol; 748 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 749 raidPtr->Layout.sectorsPerStripeUnit + 750 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 751 } 752 753 754 /* given a regionID and sector offset, determine the physical disk address of the parity log */ 755 void 756 rf_MapLogParityLogging( 757 RF_Raid_t * raidPtr, 758 RF_RegionId_t regionID, 759 RF_SectorNum_t regionOffset, 760 RF_RowCol_t * col, 761 RF_SectorNum_t * startSector) 762 { 763 *col = raidPtr->numCol - 1; 764 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; 765 } 766 767 768 /* given a regionID, determine the physical disk address of the logged 769 parity for that region */ 770 void 771 rf_MapRegionParity( 772 RF_Raid_t * raidPtr, 773 RF_RegionId_t regionID, 774 RF_RowCol_t * col, 775 RF_SectorNum_t * startSector, 776 RF_SectorCount_t * numSector) 777 { 778 *col = raidPtr->numCol - 2; 779 *startSector = raidPtr->regionInfo[regionID].parityStartAddr; 780 *numSector = raidPtr->regionInfo[regionID].numSectorsParity; 781 } 782 783 784 /* given a logical RAID address, determine the participating disks in 785 the stripe */ 786 void 787 rf_IdentifyStripeParityLogging( 788 RF_Raid_t * raidPtr, 789 RF_RaidAddr_t addr, 790 RF_RowCol_t ** diskids) 791 { 792 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, 793 addr); 794 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) 795 raidPtr->Layout.layoutSpecificInfo; 796 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; 797 } 798 799 800 void 801 rf_MapSIDToPSIDParityLogging( 802 RF_RaidLayout_t * layoutPtr, 803 RF_StripeNum_t stripeID, 804 RF_StripeNum_t * psID, 805 RF_ReconUnitNum_t * which_ru) 806 { 807 *which_ru = 0; 808 *psID = stripeID; 809 } 810 811 812 /* select an algorithm for performing an access. Returns two pointers, 813 * one to a function that will return information about the DAG, and 814 * another to a function that will create the dag. 815 */ 816 void 817 rf_ParityLoggingDagSelect( 818 RF_Raid_t * raidPtr, 819 RF_IoType_t type, 820 RF_AccessStripeMap_t * asmp, 821 RF_VoidFuncPtr * createFunc) 822 { 823 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 824 RF_PhysDiskAddr_t *failedPDA = NULL; 825 RF_RowCol_t fcol; 826 RF_RowStatus_t rstat; 827 int prior_recon; 828 829 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 830 831 if (asmp->numDataFailed + asmp->numParityFailed > 1) { 832 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); 833 *createFunc = NULL; 834 return; 835 } else 836 if (asmp->numDataFailed + asmp->numParityFailed == 1) { 837 838 /* if under recon & already reconstructed, redirect 839 * the access to the spare drive and eliminate the 840 * failure indication */ 841 failedPDA = asmp->failedPDAs[0]; 842 fcol = failedPDA->col; 843 rstat = raidPtr->status; 844 prior_recon = (rstat == rf_rs_reconfigured) || ( 845 (rstat == rf_rs_reconstructing) ? 846 rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0 847 ); 848 if (prior_recon) { 849 RF_RowCol_t oc = failedPDA->col; 850 RF_SectorNum_t oo = failedPDA->startSector; 851 if (layoutPtr->map->flags & 852 RF_DISTRIBUTE_SPARE) { 853 /* redirect to dist spare space */ 854 855 if (failedPDA == asmp->parityInfo) { 856 857 /* parity has failed */ 858 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, 859 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 860 861 if (asmp->parityInfo->next) { /* redir 2nd component, 862 * if any */ 863 RF_PhysDiskAddr_t *p = asmp->parityInfo->next; 864 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; 865 p->col = failedPDA->col; 866 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + 867 SUoffs; /* cheating: 868 * startSector is not 869 * really a RAID address */ 870 } 871 } else 872 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) { 873 RF_ASSERT(0); /* should not ever 874 * happen */ 875 } else { 876 877 /* data has failed */ 878 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, 879 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 880 881 } 882 883 } else { 884 /* redirect to dedicated spare space */ 885 886 failedPDA->col = raidPtr->Disks[fcol].spareCol; 887 888 /* the parity may have two distinct 889 * components, both of which may need 890 * to be redirected */ 891 if (asmp->parityInfo->next) { 892 if (failedPDA == asmp->parityInfo) { 893 failedPDA->next->col = failedPDA->col; 894 } else 895 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */ 896 asmp->parityInfo->col = failedPDA->col; 897 } 898 } 899 } 900 901 RF_ASSERT(failedPDA->col != -1); 902 903 if (rf_dagDebug || rf_mapDebug) { 904 printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n", 905 raidPtr->raidid, type, oc, (long) oo, failedPDA->col, (long) failedPDA->startSector); 906 } 907 asmp->numDataFailed = asmp->numParityFailed = 0; 908 } 909 } 910 if (type == RF_IO_TYPE_READ) { 911 912 if (asmp->numDataFailed == 0) 913 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; 914 else 915 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG; 916 917 } else { 918 919 920 /* if mirroring, always use large writes. If the access 921 * requires two distinct parity updates, always do a small 922 * write. If the stripe contains a failure but the access 923 * does not, do a small write. The first conditional 924 * (numStripeUnitsAccessed <= numDataCol/2) uses a 925 * less-than-or-equal rather than just a less-than because 926 * when G is 3 or 4, numDataCol/2 is 1, and I want 927 * single-stripe-unit updates to use just one disk. */ 928 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) { 929 if (((asmp->numStripeUnitsAccessed <= 930 (layoutPtr->numDataCol / 2)) && 931 (layoutPtr->numDataCol != 1)) || 932 (asmp->parityInfo->next != NULL) || 933 rf_CheckStripeForFailures(raidPtr, asmp)) { 934 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG; 935 } else 936 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG; 937 } else 938 if (asmp->numParityFailed == 1) 939 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 940 else 941 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) 942 *createFunc = NULL; 943 else 944 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 945 } 946 } 947 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 948