/* $NetBSD: rf_paritylogging.c,v 1.22 2004/02/29 04:03:50 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */


/*
  parity logging configuration, dag selection, and mapping are implemented here
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.22 2004/02/29 04:03:50 oster Exp $");

#include "rf_archs.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_dagffrd.h"
#include "rf_dagffwr.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_paritylog.h"
#include "rf_paritylogDiskMgr.h"
#include "rf_paritylogging.h"
#include "rf_parityloggingdags.h"
#include "rf_general.h"
#include "rf_map.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

typedef struct RF_ParityLoggingConfigInfo_s {
	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
					 * IdentifyStripe */
} RF_ParityLoggingConfigInfo_t;

static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);

int
rf_ConfigureParityLogging(
    RF_ShutdownList_t ** listp,
    RF_Raid_t * raidPtr,
    RF_Config_t * cfgPtr)
{
	int i, j, startdisk, rc;
	RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
	RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ParityLoggingConfigInfo_t *info;
	RF_ParityLog_t *l = NULL, *next;
	caddr_t lHeapPtr;

	if (rf_numParityRegions <= 0)
		return(EINVAL);

	/*
	 * We create multiple entries on the shutdown list here, since
	 * this configuration routine is fairly complicated in and of
	 * itself, and this makes backing out of a failed configuration
	 * much simpler.
	 */

	raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;

	/* create a parity logging configuration structure */
	RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
			(RF_ParityLoggingConfigInfo_t *),
			raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;

	/* the stripe identifier must identify the disks in each stripe, IN
	 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
	info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
						  (raidPtr->numCol),
						  raidPtr->cleanupList);
	if (info->stripeIdentifier == NULL)
		return (ENOMEM);

	startdisk = 0;
	for (i = 0; i < (raidPtr->numCol); i++) {
		for (j = 0; j < (raidPtr->numCol); j++) {
			info->stripeIdentifier[i][j] = (startdisk + j) %
				(raidPtr->numCol - 1);
		}
		if ((--startdisk) < 0)
			startdisk = raidPtr->numCol - 1 - 1;
	}

	/* fill in the remaining layout parameters */
	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
	layoutPtr->numParityCol = 1;
	layoutPtr->numParityLogCol = 1;
	layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
		layoutPtr->numParityLogCol;
	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
		layoutPtr->sectorsPerStripeUnit;
	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
		layoutPtr->sectorsPerStripeUnit;

	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
		layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;

	/* configure parity log parameters
	 *
	 * parameter                    comment/constraints
	 * -------------------------------------------
	 * numParityRegions*            all regions (except possibly last)
	 *                              of equal size
	 * totalInCoreLogCapacity*      amount of memory in bytes available
	 *                              for in-core logs (default 1 MB)
	 * numSectorsPerLog#            capacity of an in-core log in sectors
	 *                              (1 * disk track)
	 * numParityLogs                total number of in-core logs,
	 *                              should be at least numParityRegions
	 * regionLogCapacity            size of a region log (except possibly
	 *                              last one) in sectors
	 * totalLogCapacity             total amount of log space in sectors
	 *
	 * where '*' denotes a user settable parameter.
	 * Note that logs are fixed to be the size of a disk track,
	 * value #defined in rf_paritylog.h
	 *
	 */
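
	/*
	 * Illustrative sizing note (added commentary; the numbers below are
	 * assumptions, not taken from a real configuration): with a single
	 * log column, totalLogCapacity is one disk's worth of sectors.
	 * Assuming 512-byte sectors, a 64-sector in-core log, and the
	 * default 1 MB of in-core log space, the code below yields
	 * numParityLogs = 1048576 / (512 * 64) = 32; if rf_numParityRegions
	 * exceeds that, numParityLogs is raised to match so that every
	 * region can hold an in-core log simultaneously.
	 */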

	totalLogCapacity = layoutPtr->stripeUnitsPerDisk *
		layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
	raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
	if (rf_parityLogDebug)
		printf("bytes per sector %d\n", raidPtr->bytesPerSector);

	/* reduce fragmentation within a disk region by adjusting the number
	 * of regions in an attempt to allow an integral number of logs to fit
	 * into a disk region */
	fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
	if (fragmentation > 0)
		for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
			if (((totalLogCapacity / (rf_numParityRegions + i)) %
			     raidPtr->numSectorsPerLog) < fragmentation) {
				rf_numParityRegions++;
				raidPtr->regionLogCapacity = totalLogCapacity /
					rf_numParityRegions;
				fragmentation = raidPtr->regionLogCapacity %
					raidPtr->numSectorsPerLog;
			}
			if (((totalLogCapacity / (rf_numParityRegions - i)) %
			     raidPtr->numSectorsPerLog) < fragmentation) {
				rf_numParityRegions--;
				raidPtr->regionLogCapacity = totalLogCapacity /
					rf_numParityRegions;
				fragmentation = raidPtr->regionLogCapacity %
					raidPtr->numSectorsPerLog;
			}
		}
	/* ensure an integral number of logs per region */
	raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
				      raidPtr->numSectorsPerLog) *
		raidPtr->numSectorsPerLog;

	raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
		(raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
	/* to avoid deadlock, must ensure that enough logs exist for each
	 * region to have one simultaneously */
	if (raidPtr->numParityLogs < rf_numParityRegions)
		raidPtr->numParityLogs = rf_numParityRegions;

	/* create region information structs */
	printf("Allocating %d bytes for in-core parity region info\n",
	       (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
	RF_Malloc(raidPtr->regionInfo,
		  (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
		  (RF_RegionInfo_t *));
	if (raidPtr->regionInfo == NULL)
		return (ENOMEM);

	/* last region may not be full capacity */
	lastRegionCapacity = raidPtr->regionLogCapacity;
	while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
	       lastRegionCapacity > totalLogCapacity)
		lastRegionCapacity = lastRegionCapacity -
			raidPtr->numSectorsPerLog;

	raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
		rf_numParityRegions;
	maxRegionParityRange = raidPtr->regionParityRange;

	/* i can't remember why this line is in the code -wvcii 6/30/95 */
	/* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
		regionParityRange++; */

	/* build pool of unused parity logs */
	printf("Allocating %d bytes for %d parity logs\n",
	       raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
	       raidPtr->bytesPerSector,
	       raidPtr->numParityLogs);
	RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
		  raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
		  (caddr_t));
	if (raidPtr->parityLogBufferHeap == NULL)
		return (ENOMEM);
	lHeapPtr = raidPtr->parityLogBufferHeap;
	rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
			raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->numParityLogs; i++) {
		if (i == 0) {
			RF_Malloc(raidPtr->parityLogPool.parityLogs,
				  sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
			if (raidPtr->parityLogPool.parityLogs == NULL) {
				RF_Free(raidPtr->parityLogBufferHeap,
					raidPtr->numParityLogs *
					raidPtr->numSectorsPerLog *
					raidPtr->bytesPerSector);
				return (ENOMEM);
			}
			l = raidPtr->parityLogPool.parityLogs;
		} else {
			RF_Malloc(l->next, sizeof(RF_ParityLog_t),
				  (RF_ParityLog_t *));
			if (l->next == NULL) {
				RF_Free(raidPtr->parityLogBufferHeap,
					raidPtr->numParityLogs *
					raidPtr->numSectorsPerLog *
					raidPtr->bytesPerSector);
				for (l = raidPtr->parityLogPool.parityLogs;
				     l;
				     l = next) {
					next = l->next;
					if (l->records)
						RF_Free(l->records,
							(raidPtr->numSectorsPerLog *
							 sizeof(RF_ParityLogRecord_t)));
					RF_Free(l, sizeof(RF_ParityLog_t));
				}
				return (ENOMEM);
			}
			l = l->next;
		}
		l->bufPtr = lHeapPtr;
		lHeapPtr += raidPtr->numSectorsPerLog *
			raidPtr->bytesPerSector;
		RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
				       sizeof(RF_ParityLogRecord_t)),
			  (RF_ParityLogRecord_t *));
		if (l->records == NULL) {
			RF_Free(raidPtr->parityLogBufferHeap,
				raidPtr->numParityLogs *
				raidPtr->numSectorsPerLog *
				raidPtr->bytesPerSector);
			for (l = raidPtr->parityLogPool.parityLogs;
			     l;
			     l = next) {
				next = l->next;
				if (l->records)
					RF_Free(l->records,
						(raidPtr->numSectorsPerLog *
						 sizeof(RF_ParityLogRecord_t)));
				RF_Free(l, sizeof(RF_ParityLog_t));
			}
			return (ENOMEM);
		}
	}
	rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
	/* build pool of region buffers */
	rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		return (ENOMEM);
	}
	raidPtr->regionBufferPool.cond = 0;
	raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
		raidPtr->bytesPerSector;
	printf("regionBufferPool.bufferSize %d\n",
	       raidPtr->regionBufferPool.bufferSize);

	/* for now, only one region at a time may be reintegrated */
	raidPtr->regionBufferPool.totalBuffers = 1;

	raidPtr->regionBufferPool.availableBuffers =
		raidPtr->regionBufferPool.totalBuffers;
	raidPtr->regionBufferPool.availBuffersIndex = 0;
	raidPtr->regionBufferPool.emptyBuffersIndex = 0;
	printf("Allocating %d bytes for regionBufferPool\n",
	       (int) (raidPtr->regionBufferPool.totalBuffers *
		      sizeof(caddr_t)));
	RF_Malloc(raidPtr->regionBufferPool.buffers,
		  raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t),
		  (caddr_t *));
	if (raidPtr->regionBufferPool.buffers == NULL) {
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
		printf("Allocating %d bytes for regionBufferPool#%d\n",
		       (int) (raidPtr->regionBufferPool.bufferSize *
			      sizeof(char)), i);
		RF_Malloc(raidPtr->regionBufferPool.buffers[i],
			  raidPtr->regionBufferPool.bufferSize * sizeof(char),
			  (caddr_t));
		if (raidPtr->regionBufferPool.buffers[i] == NULL) {
			/* free the buffers allocated so far */
			for (j = 0; j < i; j++) {
				RF_Free(raidPtr->regionBufferPool.buffers[j],
					raidPtr->regionBufferPool.bufferSize *
					sizeof(char));
			}
			RF_Free(raidPtr->regionBufferPool.buffers,
				raidPtr->regionBufferPool.totalBuffers *
				sizeof(caddr_t));
			return (ENOMEM);
		}
		printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
		       (long) raidPtr->regionBufferPool.buffers[i]);
	}
	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingRegionBufferPool,
			  raidPtr);
	/* build pool of parity buffers */
	parityBufferCapacity = maxRegionParityRange;
	rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		return (rc);
	}
	raidPtr->parityBufferPool.cond = 0;
	raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
		raidPtr->bytesPerSector;
	printf("parityBufferPool.bufferSize %d\n",
	       raidPtr->parityBufferPool.bufferSize);

	/* for now, only one region at a time may be reintegrated */
	raidPtr->parityBufferPool.totalBuffers = 1;

	raidPtr->parityBufferPool.availableBuffers =
		raidPtr->parityBufferPool.totalBuffers;
	raidPtr->parityBufferPool.availBuffersIndex = 0;
	raidPtr->parityBufferPool.emptyBuffersIndex = 0;
	printf("Allocating %d bytes for parityBufferPool of %d units\n",
	       (int) (raidPtr->parityBufferPool.totalBuffers *
		      sizeof(caddr_t)),
	       raidPtr->parityBufferPool.totalBuffers);
	RF_Malloc(raidPtr->parityBufferPool.buffers,
		  raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t),
		  (caddr_t *));
	if (raidPtr->parityBufferPool.buffers == NULL) {
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
		printf("Allocating %d bytes for parityBufferPool#%d\n",
		       (int) (raidPtr->parityBufferPool.bufferSize *
			      sizeof(char)), i);
		RF_Malloc(raidPtr->parityBufferPool.buffers[i],
			  raidPtr->parityBufferPool.bufferSize * sizeof(char),
			  (caddr_t));
		if (raidPtr->parityBufferPool.buffers[i] == NULL) {
			/* free the buffers allocated so far */
			for (j = 0; j < i; j++) {
				RF_Free(raidPtr->parityBufferPool.buffers[j],
					raidPtr->parityBufferPool.bufferSize *
					sizeof(char));
			}
			RF_Free(raidPtr->parityBufferPool.buffers,
				raidPtr->parityBufferPool.totalBuffers *
				sizeof(caddr_t));
			return (ENOMEM);
		}
		printf("parityBufferPool.buffers[%d] = %lx\n", i,
		       (long) raidPtr->parityBufferPool.buffers[i]);
	}
	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingParityBufferPool,
			  raidPtr);
	/* initialize parityLogDiskQueue */
	rf_mutex_init(&raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.cond = 0;
	raidPtr->parityLogDiskQueue.flushQueue = NULL;
	raidPtr->parityLogDiskQueue.reintQueue = NULL;
	raidPtr->parityLogDiskQueue.bufHead = NULL;
	raidPtr->parityLogDiskQueue.bufTail = NULL;
	raidPtr->parityLogDiskQueue.reintHead = NULL;
	raidPtr->parityLogDiskQueue.reintTail = NULL;
	raidPtr->parityLogDiskQueue.logBlockHead = NULL;
	raidPtr->parityLogDiskQueue.logBlockTail = NULL;
	raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
	raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
	raidPtr->parityLogDiskQueue.freeDataList = NULL;
	raidPtr->parityLogDiskQueue.freeCommonList = NULL;

	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingDiskQueue,
			  raidPtr);
	for (i = 0; i < rf_numParityRegions; i++) {
		rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
		if (rc) {
			rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
				(rf_numParityRegions *
				 sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
		if (rc) {
			rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
				(rf_numParityRegions *
				 sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
		raidPtr->regionInfo[i].regionStartAddr =
			raidPtr->regionLogCapacity * i;
		raidPtr->regionInfo[i].parityStartAddr =
			raidPtr->regionParityRange * i;
		if (i < rf_numParityRegions - 1) {
			raidPtr->regionInfo[i].capacity =
				raidPtr->regionLogCapacity;
			raidPtr->regionInfo[i].numSectorsParity =
				raidPtr->regionParityRange;
		} else {
			raidPtr->regionInfo[i].capacity =
				lastRegionCapacity;
			raidPtr->regionInfo[i].numSectorsParity =
				raidPtr->sectorsPerDisk -
				raidPtr->regionParityRange * i;
			if (raidPtr->regionInfo[i].numSectorsParity >
			    maxRegionParityRange)
				maxRegionParityRange =
					raidPtr->regionInfo[i].numSectorsParity;
		}
		raidPtr->regionInfo[i].diskCount = 0;
		RF_ASSERT(raidPtr->regionInfo[i].capacity +
			  raidPtr->regionInfo[i].regionStartAddr <=
			  totalLogCapacity);
		RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
			  raidPtr->regionInfo[i].numSectorsParity <=
			  raidPtr->sectorsPerDisk);
		printf("Allocating %d bytes for region %d\n",
		       (int) (raidPtr->regionInfo[i].capacity *
			      sizeof(RF_DiskMap_t)), i);
		RF_Malloc(raidPtr->regionInfo[i].diskMap,
			  (raidPtr->regionInfo[i].capacity *
			   sizeof(RF_DiskMap_t)),
			  (RF_DiskMap_t *));
		if (raidPtr->regionInfo[i].diskMap == NULL) {
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
				(rf_numParityRegions *
				 sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
		raidPtr->regionInfo[i].coreLog = NULL;
	}
	rf_ShutdownCreate(listp,
			  rf_ShutdownParityLoggingRegionInfo,
			  raidPtr);
	RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
	raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
	rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
			      rf_ParityLoggingDiskManager, raidPtr, "rf_log");
	if (rc) {
		raidPtr->parityLogDiskQueue.threadState = 0;
		RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
			     __FILE__, __LINE__, rc);
		return (ENOMEM);
	}
	/* wait for thread to start */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
		RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
			     raidPtr->parityLogDiskQueue.mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);

	rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
	if (rf_parityLogDebug) {
		printf(" size of disk log in sectors: %d\n",
		       (int) totalLogCapacity);
		printf(" total number of parity regions is %d\n",
		       (int) rf_numParityRegions);
		printf(" nominal sectors of log per parity region is %d\n",
		       (int) raidPtr->regionLogCapacity);
		printf(" nominal region fragmentation is %d sectors\n",
		       (int) fragmentation);
		printf(" total number of parity logs is %d\n",
		       raidPtr->numParityLogs);
		printf(" parity log size is %d sectors\n",
		       raidPtr->numSectorsPerLog);
		printf(" total in-core log space is %d bytes\n",
		       (int) rf_totalInCoreLogCapacity);
	}
	rf_EnableParityLogging(raidPtr);

	return (0);
}

static void
FreeRegionInfo(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID)
{
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	RF_Free(raidPtr->regionInfo[regionID].diskMap,
		(raidPtr->regionInfo[regionID].capacity *
		 sizeof(RF_DiskMap_t)));
	if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
		rf_ReleaseParityLogs(raidPtr,
				     raidPtr->regionInfo[regionID].coreLog);
		raidPtr->regionInfo[regionID].coreLog = NULL;
	} else {
		RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
	}
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
}


static void
FreeParityLogQueue(
    RF_Raid_t * raidPtr,
    RF_ParityLogQueue_t * queue)
{
	RF_ParityLog_t *l1, *l2;

	RF_LOCK_MUTEX(queue->mutex);
	l1 = queue->parityLogs;
	while (l1) {
		l2 = l1;
		l1 = l2->next;
		RF_Free(l2->records, (raidPtr->numSectorsPerLog *
				      sizeof(RF_ParityLogRecord_t)));
		RF_Free(l2, sizeof(RF_ParityLog_t));
	}
	RF_UNLOCK_MUTEX(queue->mutex);
}


static void
FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
{
	int i;

	RF_LOCK_MUTEX(queue->mutex);
	if (queue->availableBuffers != queue->totalBuffers) {
		printf("Attempt to free region queue which is still in use!\n");
		RF_ASSERT(0);
	}
	for (i = 0; i < queue->totalBuffers; i++)
		RF_Free(queue->buffers[i], queue->bufferSize);
	RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
	RF_UNLOCK_MUTEX(queue->mutex);
}

static void
rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;
	RF_RegionId_t i;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingRegionInfo\n",
		       raidPtr->raidid);
	}
	/* free region information structs */
	for (i = 0; i < rf_numParityRegions; i++)
		FreeRegionInfo(raidPtr, i);
	RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
				      sizeof(RF_RegionInfo_t)));
	raidPtr->regionInfo = NULL;
}

static void
rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
	}
	/* free contents of parityLogPool */
	FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
	RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
		raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
}

static void
rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
		       raidPtr->raidid);
	}
	FreeRegionBufferQueue(&raidPtr->regionBufferPool);
}

static void
rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
		       raidPtr->raidid);
	}
	FreeRegionBufferQueue(&raidPtr->parityBufferPool);
}

static void
rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
{
	RF_ParityLogData_t *d;
	RF_CommonLogData_t *c;
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingDiskQueue\n",
		       raidPtr->raidid);
	}
	/* free disk manager stuff */
	RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
	while (raidPtr->parityLogDiskQueue.freeDataList) {
		d = raidPtr->parityLogDiskQueue.freeDataList;
		raidPtr->parityLogDiskQueue.freeDataList =
			raidPtr->parityLogDiskQueue.freeDataList->next;
		RF_Free(d, sizeof(RF_ParityLogData_t));
	}
	while (raidPtr->parityLogDiskQueue.freeCommonList) {
		c = raidPtr->parityLogDiskQueue.freeCommonList;
		raidPtr->parityLogDiskQueue.freeCommonList =
			raidPtr->parityLogDiskQueue.freeCommonList->next;
		RF_Free(c, sizeof(RF_CommonLogData_t));
	}
}

static void
rf_ShutdownParityLogging(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
	}
	/* shutdown disk thread */
	/* This has the desirable side-effect of forcing all regions to be
	 * reintegrated.  This is necessary since all parity log maps are
	 * currently held in volatile memory. */

	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
	/*
	 * pLogDiskThread will now terminate when queues are cleared
	 * now wait for it to be done
	 */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
		RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
			     raidPtr->parityLogDiskQueue.mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging done (thread completed)\n",
		       raidPtr->raidid);
	}
}

int
rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
{
	return (20);
}

RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
{
	return (10);
}

/* return the region ID for a given RAID address */
RF_RegionId_t
rf_MapRegionIDParityLogging(
    RF_Raid_t * raidPtr,
    RF_SectorNum_t address)
{
	RF_RegionId_t regionID;

	/* regionID = address / (raidPtr->regionParityRange *
	 * raidPtr->Layout.numDataCol); */
	regionID = address / raidPtr->regionParityRange;
	if (regionID == rf_numParityRegions) {
		/* last region may be larger than other regions */
		regionID--;
	}
	RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
	RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
		  raidPtr->regionInfo[regionID].numSectorsParity);
	RF_ASSERT(regionID < rf_numParityRegions);
	return (regionID);
}


/* given a logical RAID sector, determine physical disk address of data */
void
rf_MapSectorParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_StripeNum_t SUID = raidSector /
		raidPtr->Layout.sectorsPerStripeUnit;
	/* *col = (SUID % (raidPtr->numCol -
	 * raidPtr->Layout.numParityLogCol)); */
	*col = SUID % raidPtr->Layout.numDataCol;
	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
		raidPtr->Layout.sectorsPerStripeUnit +
		(raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}

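/*
 * Worked example for the data mapping above (added commentary; the
 * geometry is assumed, not taken from a real array): with
 * sectorsPerStripeUnit = 32 and numDataCol = 4, raidSector 1000 maps to
 * SUID = 1000 / 32 = 31, *col = 31 % 4 = 3, and
 * *diskSector = (31 / 4) * 32 + (1000 % 32) = 224 + 8 = 232.
 */
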
/* given a logical RAID sector, determine physical disk address of parity */
void
rf_MapParityParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t raidSector,
    RF_RowCol_t * col,
    RF_SectorNum_t * diskSector,
    int remap)
{
	RF_StripeNum_t SUID = raidSector /
		raidPtr->Layout.sectorsPerStripeUnit;

	/* *col =
	 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPtr->numCol -
	 * raidPtr->Layout.numParityLogCol); */
	*col = raidPtr->Layout.numDataCol;
	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
		raidPtr->Layout.sectorsPerStripeUnit +
		(raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}


/* given a regionID and sector offset, determine the physical disk address of
   the parity log */
void
rf_MapLogParityLogging(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_SectorNum_t regionOffset,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector)
{
	*col = raidPtr->numCol - 1;
	*startSector = raidPtr->regionInfo[regionID].regionStartAddr +
		regionOffset;
}


/* given a regionID, determine the physical disk address of the logged
   parity for that region */
void
rf_MapRegionParity(
    RF_Raid_t * raidPtr,
    RF_RegionId_t regionID,
    RF_RowCol_t * col,
    RF_SectorNum_t * startSector,
    RF_SectorCount_t * numSector)
{
	*col = raidPtr->numCol - 2;
	*startSector = raidPtr->regionInfo[regionID].parityStartAddr;
	*numSector = raidPtr->regionInfo[regionID].numSectorsParity;
}

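/*
 * Layout summary implied by the mapping routines above (added
 * commentary, not original code): data occupies columns
 * 0 .. numDataCol - 1, the dedicated parity column is numCol - 2, and
 * the parity log column is numCol - 1.  rf_MapLogParityLogging places a
 * region's log at regionStartAddr + regionOffset on the log column,
 * while rf_MapRegionParity returns that region's parity extent on the
 * parity column.
 */
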
/* given a logical RAID address, determine the participating disks in
   the stripe */
void
rf_IdentifyStripeParityLogging(
    RF_Raid_t * raidPtr,
    RF_RaidAddr_t addr,
    RF_RowCol_t ** diskids)
{
	RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
							   addr);
	RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
		raidPtr->Layout.layoutSpecificInfo;
	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
}


void
rf_MapSIDToPSIDParityLogging(
    RF_RaidLayout_t * layoutPtr,
    RF_StripeNum_t stripeID,
    RF_StripeNum_t * psID,
    RF_ReconUnitNum_t * which_ru)
{
	*which_ru = 0;
	*psID = stripeID;
}


/* select an algorithm for performing an access.  Returns two pointers,
 * one to a function that will return information about the DAG, and
 * another to a function that will create the dag.
 */
void
rf_ParityLoggingDagSelect(
    RF_Raid_t * raidPtr,
    RF_IoType_t type,
    RF_AccessStripeMap_t * asmp,
    RF_VoidFuncPtr * createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;
	RF_RowCol_t fcol;
	RF_RowStatus_t rstat;
	int prior_recon;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	if (asmp->numDataFailed + asmp->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group!  Aborting I/O operation.\n");
		*createFunc = NULL;
		return;
	} else
		if (asmp->numDataFailed + asmp->numParityFailed == 1) {

			/* if under recon & already reconstructed, redirect
			 * the access to the spare drive and eliminate the
			 * failure indication */
			failedPDA = asmp->failedPDAs[0];
			fcol = failedPDA->col;
			rstat = raidPtr->status;
			prior_recon = (rstat == rf_rs_reconfigured) || (
			    (rstat == rf_rs_reconstructing) ?
			    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap,
				failedPDA->startSector) : 0
			    );
			if (prior_recon) {
				RF_RowCol_t oc = failedPDA->col;
				RF_SectorNum_t oo = failedPDA->startSector;
				if (layoutPtr->map->flags &
				    RF_DISTRIBUTE_SPARE) {
					/* redirect to dist spare space */

					if (failedPDA == asmp->parityInfo) {

						/* parity has failed */
						(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						if (asmp->parityInfo->next) {	/* redir 2nd component,
										 * if any */
							RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
							RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
							p->col = failedPDA->col;
							p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
							    SUoffs;	/* cheating:
									 * startSector is not
									 * really a RAID address */
						}
					} else
						if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
							RF_ASSERT(0);	/* should not ever
									 * happen */
						} else {

							/* data has failed */
							(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
							    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						}

				} else {
					/* redirect to dedicated spare space */

					failedPDA->col = raidPtr->Disks[fcol].spareCol;

					/* the parity may have two distinct
					 * components, both of which may need
					 * to be redirected */
					if (asmp->parityInfo->next) {
						if (failedPDA == asmp->parityInfo) {
							failedPDA->next->col = failedPDA->col;
						} else
							if (failedPDA == asmp->parityInfo->next) {	/* paranoid: should never occur */
								asmp->parityInfo->col = failedPDA->col;
							}
					}
				}

				RF_ASSERT(failedPDA->col != -1);

				if (rf_dagDebug || rf_mapDebug) {
					printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n",
					    raidPtr->raidid, type, oc, (long) oo,
					    failedPDA->col, (long) failedPDA->startSector);
				}
				asmp->numDataFailed = asmp->numParityFailed = 0;
			}
		}
	if (type == RF_IO_TYPE_READ) {

		if (asmp->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;

	} else {


		/* if mirroring, always use large writes.  If the access
		 * requires two distinct parity updates, always do a small
		 * write.  If the stripe contains a failure but the access
		 * does not, do a small write.  The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk. */
		if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
			if (((asmp->numStripeUnitsAccessed <=
			      (layoutPtr->numDataCol / 2)) &&
			     (layoutPtr->numDataCol != 1)) ||
			    (asmp->parityInfo->next != NULL) ||
			    rf_CheckStripeForFailures(raidPtr, asmp)) {
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
			} else
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
		} else
			if (asmp->numParityFailed == 1)
				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
			else
				if (asmp->numStripeUnitsAccessed != 1 &&
				    failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;
				else
					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
	}
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */