1 /* $NetBSD: rf_paritylogging.c,v 1.13 2002/09/14 17:53:58 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 30 /* 31 parity logging configuration, dag selection, and mapping is implemented here 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.13 2002/09/14 17:53:58 oster Exp $"); 36 37 #include "rf_archs.h" 38 39 #if RF_INCLUDE_PARITYLOGGING > 0 40 41 #include <dev/raidframe/raidframevar.h> 42 43 #include "rf_raid.h" 44 #include "rf_dag.h" 45 #include "rf_dagutils.h" 46 #include "rf_dagfuncs.h" 47 #include "rf_dagffrd.h" 48 #include "rf_dagffwr.h" 49 #include "rf_dagdegrd.h" 50 #include "rf_dagdegwr.h" 51 #include "rf_paritylog.h" 52 #include "rf_paritylogDiskMgr.h" 53 #include "rf_paritylogging.h" 54 #include "rf_parityloggingdags.h" 55 #include "rf_general.h" 56 #include "rf_map.h" 57 #include "rf_utils.h" 58 #include "rf_shutdown.h" 59 60 typedef struct RF_ParityLoggingConfigInfo_s { 61 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by 62 * IdentifyStripe */ 63 } RF_ParityLoggingConfigInfo_t; 64 65 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID); 66 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); 67 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); 68 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); 69 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); 70 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); 71 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); 72 73 int 74 rf_ConfigureParityLogging( 75 RF_ShutdownList_t ** listp, 76 RF_Raid_t * raidPtr, 77 RF_Config_t * cfgPtr) 78 { 79 int i, j, startdisk, rc; 80 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; 81 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; 82 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 83 RF_ParityLoggingConfigInfo_t *info; 84 RF_ParityLog_t *l = NULL, *next; 85 caddr_t lHeapPtr; 86 87 if (rf_numParityRegions <= 0) 88 return(EINVAL); 89 90 /* 91 * We create multiple entries on the shutdown list here, since 92 * this configuration routine is fairly complicated in and of 93 * itself, and this makes backing out of a failed configuration 94 * much simpler. 95 */ 96 97 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; 98 99 /* create a parity logging configuration structure */ 100 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), 101 (RF_ParityLoggingConfigInfo_t *), 102 raidPtr->cleanupList); 103 if (info == NULL) 104 return (ENOMEM); 105 layoutPtr->layoutSpecificInfo = (void *) info; 106 107 RF_ASSERT(raidPtr->numRow == 1); 108 109 /* the stripe identifier must identify the disks in each stripe, IN 110 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 111 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), 112 (raidPtr->numCol), 113 raidPtr->cleanupList); 114 if (info->stripeIdentifier == NULL) 115 return (ENOMEM); 116 117 startdisk = 0; 118 for (i = 0; i < (raidPtr->numCol); i++) { 119 for (j = 0; j < (raidPtr->numCol); j++) { 120 info->stripeIdentifier[i][j] = (startdisk + j) % 121 (raidPtr->numCol - 1); 122 } 123 if ((--startdisk) < 0) 124 startdisk = raidPtr->numCol - 1 - 1; 125 } 126 127 /* fill in the remaining layout parameters */ 128 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 129 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << 130 raidPtr->logBytesPerSector; 131 layoutPtr->numParityCol = 1; 132 layoutPtr->numParityLogCol = 1; 133 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - 134 layoutPtr->numParityLogCol; 135 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * 136 layoutPtr->sectorsPerStripeUnit; 137 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 138 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * 139 layoutPtr->sectorsPerStripeUnit; 140 141 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * 142 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 143 144 /* configure parity log parameters 145 * 146 * parameter comment/constraints 147 * ------------------------------------------- 148 * numParityRegions* all regions (except possibly last) 149 * of equal size 150 * totalInCoreLogCapacity* amount of memory in bytes available 151 * for in-core logs (default 1 MB) 152 * numSectorsPerLog# capacity of an in-core log in sectors 153 * (1 * disk track) 154 * numParityLogs total number of in-core logs, 155 * should be at least numParityRegions 156 * regionLogCapacity size of a region log (except possibly 157 * last one) in sectors 158 * totalLogCapacity total amount of log space in sectors 159 * 160 * where '*' denotes a user settable parameter. 161 * Note that logs are fixed to be the size of a disk track, 162 * value #defined in rf_paritylog.h 163 * 164 */ 165 166 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; 167 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; 168 if (rf_parityLogDebug) 169 printf("bytes per sector %d\n", raidPtr->bytesPerSector); 170 171 /* reduce fragmentation within a disk region by adjusting the number 172 * of regions in an attempt to allow an integral number of logs to fit 173 * into a disk region */ 174 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; 175 if (fragmentation > 0) 176 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { 177 if (((totalLogCapacity / (rf_numParityRegions + i)) % 178 raidPtr->numSectorsPerLog) < fragmentation) { 179 rf_numParityRegions++; 180 raidPtr->regionLogCapacity = totalLogCapacity / 181 rf_numParityRegions; 182 fragmentation = raidPtr->regionLogCapacity % 183 raidPtr->numSectorsPerLog; 184 } 185 if (((totalLogCapacity / (rf_numParityRegions - i)) % 186 raidPtr->numSectorsPerLog) < fragmentation) { 187 rf_numParityRegions--; 188 raidPtr->regionLogCapacity = totalLogCapacity / 189 rf_numParityRegions; 190 fragmentation = raidPtr->regionLogCapacity % 191 raidPtr->numSectorsPerLog; 192 } 193 } 194 /* ensure integral number of regions per log */ 195 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / 196 raidPtr->numSectorsPerLog) * 197 raidPtr->numSectorsPerLog; 198 199 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / 200 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); 201 /* to avoid deadlock, must ensure that enough logs exist for each 202 * region to have one simultaneously */ 203 if (raidPtr->numParityLogs < rf_numParityRegions) 204 raidPtr->numParityLogs = rf_numParityRegions; 205 206 /* create region information structs */ 207 printf("Allocating %d bytes for in-core parity region info\n", 208 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); 209 RF_Malloc(raidPtr->regionInfo, 210 (rf_numParityRegions * sizeof(RF_RegionInfo_t)), 211 (RF_RegionInfo_t *)); 212 if (raidPtr->regionInfo == NULL) 213 return (ENOMEM); 214 215 /* last region may not be full capacity */ 216 lastRegionCapacity = raidPtr->regionLogCapacity; 217 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + 218 lastRegionCapacity > totalLogCapacity) 219 lastRegionCapacity = lastRegionCapacity - 220 raidPtr->numSectorsPerLog; 221 222 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / 223 rf_numParityRegions; 224 maxRegionParityRange = raidPtr->regionParityRange; 225 226 /* i can't remember why this line is in the code -wvcii 6/30/95 */ 227 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) 228 regionParityRange++; */ 229 230 /* build pool of unused parity logs */ 231 printf("Allocating %d bytes for %d parity logs\n", 232 raidPtr->numParityLogs * raidPtr->numSectorsPerLog * 233 raidPtr->bytesPerSector, 234 raidPtr->numParityLogs); 235 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 236 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, 237 (caddr_t)); 238 if (raidPtr->parityLogBufferHeap == NULL) 239 return (ENOMEM); 240 lHeapPtr = raidPtr->parityLogBufferHeap; 241 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex); 242 if (rc) { 243 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 244 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 245 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 246 return (ENOMEM); 247 } 248 for (i = 0; i < raidPtr->numParityLogs; i++) { 249 if (i == 0) { 250 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, 251 sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); 252 if (raidPtr->parityLogPool.parityLogs == NULL) { 253 RF_Free(raidPtr->parityLogBufferHeap, 254 raidPtr->numParityLogs * 255 raidPtr->numSectorsPerLog * 256 raidPtr->bytesPerSector); 257 return (ENOMEM); 258 } 259 l = raidPtr->parityLogPool.parityLogs; 260 } else { 261 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), 262 (RF_ParityLog_t *)); 263 if (l->next == NULL) { 264 RF_Free(raidPtr->parityLogBufferHeap, 265 raidPtr->numParityLogs * 266 raidPtr->numSectorsPerLog * 267 raidPtr->bytesPerSector); 268 for (l = raidPtr->parityLogPool.parityLogs; 269 l; 270 l = next) { 271 next = l->next; 272 if (l->records) 273 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); 274 RF_Free(l, sizeof(RF_ParityLog_t)); 275 } 276 return (ENOMEM); 277 } 278 l = l->next; 279 } 280 l->bufPtr = lHeapPtr; 281 lHeapPtr += raidPtr->numSectorsPerLog * 282 raidPtr->bytesPerSector; 283 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * 284 sizeof(RF_ParityLogRecord_t)), 285 (RF_ParityLogRecord_t *)); 286 if (l->records == NULL) { 287 RF_Free(raidPtr->parityLogBufferHeap, 288 raidPtr->numParityLogs * 289 raidPtr->numSectorsPerLog * 290 raidPtr->bytesPerSector); 291 for (l = raidPtr->parityLogPool.parityLogs; 292 l; 293 l = next) { 294 next = l->next; 295 if (l->records) 296 RF_Free(l->records, 297 (raidPtr->numSectorsPerLog * 298 sizeof(RF_ParityLogRecord_t))); 299 RF_Free(l, sizeof(RF_ParityLog_t)); 300 } 301 return (ENOMEM); 302 } 303 } 304 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); 305 if (rc) { 306 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 307 __LINE__, rc); 308 rf_ShutdownParityLoggingPool(raidPtr); 309 return (rc); 310 } 311 /* build pool of region buffers */ 312 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex); 313 if (rc) { 314 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 315 return (ENOMEM); 316 } 317 rc = rf_cond_init(&raidPtr->regionBufferPool.cond); 318 if (rc) { 319 rf_print_unable_to_init_cond(__FILE__, __LINE__, rc); 320 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 321 return (ENOMEM); 322 } 323 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * 324 raidPtr->bytesPerSector; 325 printf("regionBufferPool.bufferSize %d\n", 326 raidPtr->regionBufferPool.bufferSize); 327 328 /* for now, only one region at a time may be reintegrated */ 329 raidPtr->regionBufferPool.totalBuffers = 1; 330 331 raidPtr->regionBufferPool.availableBuffers = 332 raidPtr->regionBufferPool.totalBuffers; 333 raidPtr->regionBufferPool.availBuffersIndex = 0; 334 raidPtr->regionBufferPool.emptyBuffersIndex = 0; 335 printf("Allocating %d bytes for regionBufferPool\n", 336 (int) (raidPtr->regionBufferPool.totalBuffers * 337 sizeof(caddr_t))); 338 RF_Malloc(raidPtr->regionBufferPool.buffers, 339 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), 340 (caddr_t *)); 341 if (raidPtr->regionBufferPool.buffers == NULL) { 342 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 343 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 344 return (ENOMEM); 345 } 346 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { 347 printf("Allocating %d bytes for regionBufferPool#%d\n", 348 (int) (raidPtr->regionBufferPool.bufferSize * 349 sizeof(char)), i); 350 RF_Malloc(raidPtr->regionBufferPool.buffers[i], 351 raidPtr->regionBufferPool.bufferSize * sizeof(char), 352 (caddr_t)); 353 if (raidPtr->regionBufferPool.buffers[i] == NULL) { 354 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); 355 rf_cond_destroy(&raidPtr->regionBufferPool.cond); 356 for (j = 0; j < i; j++) { 357 RF_Free(raidPtr->regionBufferPool.buffers[i], 358 raidPtr->regionBufferPool.bufferSize * 359 sizeof(char)); 360 } 361 RF_Free(raidPtr->regionBufferPool.buffers, 362 raidPtr->regionBufferPool.totalBuffers * 363 sizeof(caddr_t)); 364 return (ENOMEM); 365 } 366 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, 367 (long) raidPtr->regionBufferPool.buffers[i]); 368 } 369 rc = rf_ShutdownCreate(listp, 370 rf_ShutdownParityLoggingRegionBufferPool, 371 raidPtr); 372 if (rc) { 373 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 374 __LINE__, rc); 375 rf_ShutdownParityLoggingRegionBufferPool(raidPtr); 376 return (rc); 377 } 378 /* build pool of parity buffers */ 379 parityBufferCapacity = maxRegionParityRange; 380 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex); 381 if (rc) { 382 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 383 return (rc); 384 } 385 rc = rf_cond_init(&raidPtr->parityBufferPool.cond); 386 if (rc) { 387 rf_print_unable_to_init_cond(__FILE__, __LINE__, rc); 388 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 389 return (ENOMEM); 390 } 391 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * 392 raidPtr->bytesPerSector; 393 printf("parityBufferPool.bufferSize %d\n", 394 raidPtr->parityBufferPool.bufferSize); 395 396 /* for now, only one region at a time may be reintegrated */ 397 raidPtr->parityBufferPool.totalBuffers = 1; 398 399 raidPtr->parityBufferPool.availableBuffers = 400 raidPtr->parityBufferPool.totalBuffers; 401 raidPtr->parityBufferPool.availBuffersIndex = 0; 402 raidPtr->parityBufferPool.emptyBuffersIndex = 0; 403 printf("Allocating %d bytes for parityBufferPool of %d units\n", 404 (int) (raidPtr->parityBufferPool.totalBuffers * 405 sizeof(caddr_t)), 406 raidPtr->parityBufferPool.totalBuffers ); 407 RF_Malloc(raidPtr->parityBufferPool.buffers, 408 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), 409 (caddr_t *)); 410 if (raidPtr->parityBufferPool.buffers == NULL) { 411 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 412 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 413 return (ENOMEM); 414 } 415 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { 416 printf("Allocating %d bytes for parityBufferPool#%d\n", 417 (int) (raidPtr->parityBufferPool.bufferSize * 418 sizeof(char)),i); 419 RF_Malloc(raidPtr->parityBufferPool.buffers[i], 420 raidPtr->parityBufferPool.bufferSize * sizeof(char), 421 (caddr_t)); 422 if (raidPtr->parityBufferPool.buffers == NULL) { 423 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); 424 rf_cond_destroy(&raidPtr->parityBufferPool.cond); 425 for (j = 0; j < i; j++) { 426 RF_Free(raidPtr->parityBufferPool.buffers[i], 427 raidPtr->regionBufferPool.bufferSize * 428 sizeof(char)); 429 } 430 RF_Free(raidPtr->parityBufferPool.buffers, 431 raidPtr->regionBufferPool.totalBuffers * 432 sizeof(caddr_t)); 433 return (ENOMEM); 434 } 435 printf("parityBufferPool.buffers[%d] = %lx\n", i, 436 (long) raidPtr->parityBufferPool.buffers[i]); 437 } 438 rc = rf_ShutdownCreate(listp, 439 rf_ShutdownParityLoggingParityBufferPool, 440 raidPtr); 441 if (rc) { 442 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 443 __LINE__, rc); 444 rf_ShutdownParityLoggingParityBufferPool(raidPtr); 445 return (rc); 446 } 447 /* initialize parityLogDiskQueue */ 448 rc = rf_create_managed_mutex(listp, 449 &raidPtr->parityLogDiskQueue.mutex); 450 if (rc) { 451 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 452 return (rc); 453 } 454 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond); 455 if (rc) { 456 rf_print_unable_to_init_cond(__FILE__, __LINE__, rc); 457 return (rc); 458 } 459 raidPtr->parityLogDiskQueue.flushQueue = NULL; 460 raidPtr->parityLogDiskQueue.reintQueue = NULL; 461 raidPtr->parityLogDiskQueue.bufHead = NULL; 462 raidPtr->parityLogDiskQueue.bufTail = NULL; 463 raidPtr->parityLogDiskQueue.reintHead = NULL; 464 raidPtr->parityLogDiskQueue.reintTail = NULL; 465 raidPtr->parityLogDiskQueue.logBlockHead = NULL; 466 raidPtr->parityLogDiskQueue.logBlockTail = NULL; 467 raidPtr->parityLogDiskQueue.reintBlockHead = NULL; 468 raidPtr->parityLogDiskQueue.reintBlockTail = NULL; 469 raidPtr->parityLogDiskQueue.freeDataList = NULL; 470 raidPtr->parityLogDiskQueue.freeCommonList = NULL; 471 472 rc = rf_ShutdownCreate(listp, 473 rf_ShutdownParityLoggingDiskQueue, 474 raidPtr); 475 if (rc) { 476 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 477 __LINE__, rc); 478 return (rc); 479 } 480 for (i = 0; i < rf_numParityRegions; i++) { 481 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex); 482 if (rc) { 483 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 484 for (j = 0; j < i; j++) 485 FreeRegionInfo(raidPtr, j); 486 RF_Free(raidPtr->regionInfo, 487 (rf_numParityRegions * 488 sizeof(RF_RegionInfo_t))); 489 return (ENOMEM); 490 } 491 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); 492 if (rc) { 493 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 494 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 495 for (j = 0; j < i; j++) 496 FreeRegionInfo(raidPtr, j); 497 RF_Free(raidPtr->regionInfo, 498 (rf_numParityRegions * 499 sizeof(RF_RegionInfo_t))); 500 return (ENOMEM); 501 } 502 raidPtr->regionInfo[i].reintInProgress = RF_FALSE; 503 raidPtr->regionInfo[i].regionStartAddr = 504 raidPtr->regionLogCapacity * i; 505 raidPtr->regionInfo[i].parityStartAddr = 506 raidPtr->regionParityRange * i; 507 if (i < rf_numParityRegions - 1) { 508 raidPtr->regionInfo[i].capacity = 509 raidPtr->regionLogCapacity; 510 raidPtr->regionInfo[i].numSectorsParity = 511 raidPtr->regionParityRange; 512 } else { 513 raidPtr->regionInfo[i].capacity = 514 lastRegionCapacity; 515 raidPtr->regionInfo[i].numSectorsParity = 516 raidPtr->sectorsPerDisk - 517 raidPtr->regionParityRange * i; 518 if (raidPtr->regionInfo[i].numSectorsParity > 519 maxRegionParityRange) 520 maxRegionParityRange = 521 raidPtr->regionInfo[i].numSectorsParity; 522 } 523 raidPtr->regionInfo[i].diskCount = 0; 524 RF_ASSERT(raidPtr->regionInfo[i].capacity + 525 raidPtr->regionInfo[i].regionStartAddr <= 526 totalLogCapacity); 527 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + 528 raidPtr->regionInfo[i].numSectorsParity <= 529 raidPtr->sectorsPerDisk); 530 printf("Allocating %d bytes for region %d\n", 531 (int) (raidPtr->regionInfo[i].capacity * 532 sizeof(RF_DiskMap_t)), i); 533 RF_Malloc(raidPtr->regionInfo[i].diskMap, 534 (raidPtr->regionInfo[i].capacity * 535 sizeof(RF_DiskMap_t)), 536 (RF_DiskMap_t *)); 537 if (raidPtr->regionInfo[i].diskMap == NULL) { 538 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); 539 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex); 540 for (j = 0; j < i; j++) 541 FreeRegionInfo(raidPtr, j); 542 RF_Free(raidPtr->regionInfo, 543 (rf_numParityRegions * 544 sizeof(RF_RegionInfo_t))); 545 return (ENOMEM); 546 } 547 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; 548 raidPtr->regionInfo[i].coreLog = NULL; 549 } 550 rc = rf_ShutdownCreate(listp, 551 rf_ShutdownParityLoggingRegionInfo, 552 raidPtr); 553 if (rc) { 554 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 555 __LINE__, rc); 556 rf_ShutdownParityLoggingRegionInfo(raidPtr); 557 return (rc); 558 } 559 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); 560 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; 561 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, 562 rf_ParityLoggingDiskManager, raidPtr,"rf_log"); 563 if (rc) { 564 raidPtr->parityLogDiskQueue.threadState = 0; 565 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", 566 __FILE__, __LINE__, rc); 567 return (ENOMEM); 568 } 569 /* wait for thread to start */ 570 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 571 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { 572 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 573 raidPtr->parityLogDiskQueue.mutex); 574 } 575 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 576 577 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); 578 if (rc) { 579 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc); 580 rf_ShutdownParityLogging(raidPtr); 581 return (rc); 582 } 583 if (rf_parityLogDebug) { 584 printf(" size of disk log in sectors: %d\n", 585 (int) totalLogCapacity); 586 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions); 587 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity); 588 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation); 589 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); 590 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); 591 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); 592 } 593 rf_EnableParityLogging(raidPtr); 594 595 return (0); 596 } 597 598 static void 599 FreeRegionInfo( 600 RF_Raid_t * raidPtr, 601 RF_RegionId_t regionID) 602 { 603 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 604 RF_Free(raidPtr->regionInfo[regionID].diskMap, 605 (raidPtr->regionInfo[regionID].capacity * 606 sizeof(RF_DiskMap_t))); 607 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { 608 rf_ReleaseParityLogs(raidPtr, 609 raidPtr->regionInfo[regionID].coreLog); 610 raidPtr->regionInfo[regionID].coreLog = NULL; 611 } else { 612 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); 613 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); 614 } 615 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 616 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex); 617 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex); 618 } 619 620 621 static void 622 FreeParityLogQueue( 623 RF_Raid_t * raidPtr, 624 RF_ParityLogQueue_t * queue) 625 { 626 RF_ParityLog_t *l1, *l2; 627 628 RF_LOCK_MUTEX(queue->mutex); 629 l1 = queue->parityLogs; 630 while (l1) { 631 l2 = l1; 632 l1 = l2->next; 633 RF_Free(l2->records, (raidPtr->numSectorsPerLog * 634 sizeof(RF_ParityLogRecord_t))); 635 RF_Free(l2, sizeof(RF_ParityLog_t)); 636 } 637 RF_UNLOCK_MUTEX(queue->mutex); 638 rf_mutex_destroy(&queue->mutex); 639 } 640 641 642 static void 643 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) 644 { 645 int i; 646 647 RF_LOCK_MUTEX(queue->mutex); 648 if (queue->availableBuffers != queue->totalBuffers) { 649 printf("Attempt to free region queue which is still in use!\n"); 650 RF_ASSERT(0); 651 } 652 for (i = 0; i < queue->totalBuffers; i++) 653 RF_Free(queue->buffers[i], queue->bufferSize); 654 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t)); 655 RF_UNLOCK_MUTEX(queue->mutex); 656 rf_mutex_destroy(&queue->mutex); 657 } 658 659 static void 660 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) 661 { 662 RF_Raid_t *raidPtr; 663 RF_RegionId_t i; 664 665 raidPtr = (RF_Raid_t *) arg; 666 if (rf_parityLogDebug) { 667 printf("raid%d: ShutdownParityLoggingRegionInfo\n", 668 raidPtr->raidid); 669 } 670 /* free region information structs */ 671 for (i = 0; i < rf_numParityRegions; i++) 672 FreeRegionInfo(raidPtr, i); 673 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * 674 sizeof(raidPtr->regionInfo))); 675 raidPtr->regionInfo = NULL; 676 } 677 678 static void 679 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) 680 { 681 RF_Raid_t *raidPtr; 682 683 raidPtr = (RF_Raid_t *) arg; 684 if (rf_parityLogDebug) { 685 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid); 686 } 687 /* free contents of parityLogPool */ 688 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); 689 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 690 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 691 } 692 693 static void 694 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) 695 { 696 RF_Raid_t *raidPtr; 697 698 raidPtr = (RF_Raid_t *) arg; 699 if (rf_parityLogDebug) { 700 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n", 701 raidPtr->raidid); 702 } 703 FreeRegionBufferQueue(&raidPtr->regionBufferPool); 704 } 705 706 static void 707 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) 708 { 709 RF_Raid_t *raidPtr; 710 711 raidPtr = (RF_Raid_t *) arg; 712 if (rf_parityLogDebug) { 713 printf("raid%d: ShutdownParityLoggingParityBufferPool\n", 714 raidPtr->raidid); 715 } 716 FreeRegionBufferQueue(&raidPtr->parityBufferPool); 717 } 718 719 static void 720 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) 721 { 722 RF_ParityLogData_t *d; 723 RF_CommonLogData_t *c; 724 RF_Raid_t *raidPtr; 725 726 raidPtr = (RF_Raid_t *) arg; 727 if (rf_parityLogDebug) { 728 printf("raid%d: ShutdownParityLoggingDiskQueue\n", 729 raidPtr->raidid); 730 } 731 /* free disk manager stuff */ 732 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); 733 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); 734 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); 735 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); 736 while (raidPtr->parityLogDiskQueue.freeDataList) { 737 d = raidPtr->parityLogDiskQueue.freeDataList; 738 raidPtr->parityLogDiskQueue.freeDataList = 739 raidPtr->parityLogDiskQueue.freeDataList->next; 740 RF_Free(d, sizeof(RF_ParityLogData_t)); 741 } 742 while (raidPtr->parityLogDiskQueue.freeCommonList) { 743 c = raidPtr->parityLogDiskQueue.freeCommonList; 744 rf_mutex_destroy(&c->mutex); 745 raidPtr->parityLogDiskQueue.freeCommonList = 746 raidPtr->parityLogDiskQueue.freeCommonList->next; 747 RF_Free(c, sizeof(RF_CommonLogData_t)); 748 } 749 } 750 751 static void 752 rf_ShutdownParityLogging(RF_ThreadArg_t arg) 753 { 754 RF_Raid_t *raidPtr; 755 756 raidPtr = (RF_Raid_t *) arg; 757 if (rf_parityLogDebug) { 758 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid); 759 } 760 /* shutdown disk thread */ 761 /* This has the desirable side-effect of forcing all regions to be 762 * reintegrated. This is necessary since all parity log maps are 763 * currently held in volatile memory. */ 764 765 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 766 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; 767 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 768 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); 769 /* 770 * pLogDiskThread will now terminate when queues are cleared 771 * now wait for it to be done 772 */ 773 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 774 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) { 775 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 776 raidPtr->parityLogDiskQueue.mutex); 777 } 778 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 779 if (rf_parityLogDebug) { 780 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid); 781 } 782 } 783 784 int 785 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) 786 { 787 return (20); 788 } 789 790 RF_HeadSepLimit_t 791 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) 792 { 793 return (10); 794 } 795 /* return the region ID for a given RAID address */ 796 RF_RegionId_t 797 rf_MapRegionIDParityLogging( 798 RF_Raid_t * raidPtr, 799 RF_SectorNum_t address) 800 { 801 RF_RegionId_t regionID; 802 803 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ 804 regionID = address / raidPtr->regionParityRange; 805 if (regionID == rf_numParityRegions) { 806 /* last region may be larger than other regions */ 807 regionID--; 808 } 809 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); 810 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + 811 raidPtr->regionInfo[regionID].numSectorsParity); 812 RF_ASSERT(regionID < rf_numParityRegions); 813 return (regionID); 814 } 815 816 817 /* given a logical RAID sector, determine physical disk address of data */ 818 void 819 rf_MapSectorParityLogging( 820 RF_Raid_t * raidPtr, 821 RF_RaidAddr_t raidSector, 822 RF_RowCol_t * row, 823 RF_RowCol_t * col, 824 RF_SectorNum_t * diskSector, 825 int remap) 826 { 827 RF_StripeNum_t SUID = raidSector / 828 raidPtr->Layout.sectorsPerStripeUnit; 829 *row = 0; 830 /* *col = (SUID % (raidPtr->numCol - 831 * raidPtr->Layout.numParityLogCol)); */ 832 *col = SUID % raidPtr->Layout.numDataCol; 833 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 834 raidPtr->Layout.sectorsPerStripeUnit + 835 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 836 } 837 838 839 /* given a logical RAID sector, determine physical disk address of parity */ 840 void 841 rf_MapParityParityLogging( 842 RF_Raid_t * raidPtr, 843 RF_RaidAddr_t raidSector, 844 RF_RowCol_t * row, 845 RF_RowCol_t * col, 846 RF_SectorNum_t * diskSector, 847 int remap) 848 { 849 RF_StripeNum_t SUID = raidSector / 850 raidPtr->Layout.sectorsPerStripeUnit; 851 852 *row = 0; 853 /* *col = 854 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt 855 * r->numCol - raidPtr->Layout.numParityLogCol); */ 856 *col = raidPtr->Layout.numDataCol; 857 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 858 raidPtr->Layout.sectorsPerStripeUnit + 859 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 860 } 861 862 863 /* given a regionID and sector offset, determine the physical disk address of the parity log */ 864 void 865 rf_MapLogParityLogging( 866 RF_Raid_t * raidPtr, 867 RF_RegionId_t regionID, 868 RF_SectorNum_t regionOffset, 869 RF_RowCol_t * row, 870 RF_RowCol_t * col, 871 RF_SectorNum_t * startSector) 872 { 873 *row = 0; 874 *col = raidPtr->numCol - 1; 875 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; 876 } 877 878 879 /* given a regionID, determine the physical disk address of the logged 880 parity for that region */ 881 void 882 rf_MapRegionParity( 883 RF_Raid_t * raidPtr, 884 RF_RegionId_t regionID, 885 RF_RowCol_t * row, 886 RF_RowCol_t * col, 887 RF_SectorNum_t * startSector, 888 RF_SectorCount_t * numSector) 889 { 890 *row = 0; 891 *col = raidPtr->numCol - 2; 892 *startSector = raidPtr->regionInfo[regionID].parityStartAddr; 893 *numSector = raidPtr->regionInfo[regionID].numSectorsParity; 894 } 895 896 897 /* given a logical RAID address, determine the participating disks in 898 the stripe */ 899 void 900 rf_IdentifyStripeParityLogging( 901 RF_Raid_t * raidPtr, 902 RF_RaidAddr_t addr, 903 RF_RowCol_t ** diskids, 904 RF_RowCol_t * outRow) 905 { 906 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, 907 addr); 908 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) 909 raidPtr->Layout.layoutSpecificInfo; 910 *outRow = 0; 911 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; 912 } 913 914 915 void 916 rf_MapSIDToPSIDParityLogging( 917 RF_RaidLayout_t * layoutPtr, 918 RF_StripeNum_t stripeID, 919 RF_StripeNum_t * psID, 920 RF_ReconUnitNum_t * which_ru) 921 { 922 *which_ru = 0; 923 *psID = stripeID; 924 } 925 926 927 /* select an algorithm for performing an access. Returns two pointers, 928 * one to a function that will return information about the DAG, and 929 * another to a function that will create the dag. 930 */ 931 void 932 rf_ParityLoggingDagSelect( 933 RF_Raid_t * raidPtr, 934 RF_IoType_t type, 935 RF_AccessStripeMap_t * asmp, 936 RF_VoidFuncPtr * createFunc) 937 { 938 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 939 RF_PhysDiskAddr_t *failedPDA = NULL; 940 RF_RowCol_t frow, fcol; 941 RF_RowStatus_t rstat; 942 int prior_recon; 943 944 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 945 946 if (asmp->numDataFailed + asmp->numParityFailed > 1) { 947 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); 948 /* *infoFunc = */ *createFunc = NULL; 949 return; 950 } else 951 if (asmp->numDataFailed + asmp->numParityFailed == 1) { 952 953 /* if under recon & already reconstructed, redirect 954 * the access to the spare drive and eliminate the 955 * failure indication */ 956 failedPDA = asmp->failedPDAs[0]; 957 frow = failedPDA->row; 958 fcol = failedPDA->col; 959 rstat = raidPtr->status[failedPDA->row]; 960 prior_recon = (rstat == rf_rs_reconfigured) || ( 961 (rstat == rf_rs_reconstructing) ? 962 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0 963 ); 964 if (prior_recon) { 965 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col; 966 RF_SectorNum_t oo = failedPDA->startSector; 967 if (layoutPtr->map->flags & 968 RF_DISTRIBUTE_SPARE) { 969 /* redirect to dist spare space */ 970 971 if (failedPDA == asmp->parityInfo) { 972 973 /* parity has failed */ 974 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row, 975 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 976 977 if (asmp->parityInfo->next) { /* redir 2nd component, 978 * if any */ 979 RF_PhysDiskAddr_t *p = asmp->parityInfo->next; 980 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; 981 p->row = failedPDA->row; 982 p->col = failedPDA->col; 983 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + 984 SUoffs; /* cheating: 985 * startSector is not 986 * really a RAID address */ 987 } 988 } else 989 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) { 990 RF_ASSERT(0); /* should not ever 991 * happen */ 992 } else { 993 994 /* data has failed */ 995 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row, 996 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 997 998 } 999 1000 } else { 1001 /* redirect to dedicated spare space */ 1002 1003 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow; 1004 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol; 1005 1006 /* the parity may have two distinct 1007 * components, both of which may need 1008 * to be redirected */ 1009 if (asmp->parityInfo->next) { 1010 if (failedPDA == asmp->parityInfo) { 1011 failedPDA->next->row = failedPDA->row; 1012 failedPDA->next->col = failedPDA->col; 1013 } else 1014 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */ 1015 asmp->parityInfo->row = failedPDA->row; 1016 asmp->parityInfo->col = failedPDA->col; 1017 } 1018 } 1019 } 1020 1021 RF_ASSERT(failedPDA->col != -1); 1022 1023 if (rf_dagDebug || rf_mapDebug) { 1024 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n", 1025 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector); 1026 } 1027 asmp->numDataFailed = asmp->numParityFailed = 0; 1028 } 1029 } 1030 if (type == RF_IO_TYPE_READ) { 1031 1032 if (asmp->numDataFailed == 0) 1033 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; 1034 else 1035 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG; 1036 1037 } else { 1038 1039 1040 /* if mirroring, always use large writes. If the access 1041 * requires two distinct parity updates, always do a small 1042 * write. If the stripe contains a failure but the access 1043 * does not, do a small write. The first conditional 1044 * (numStripeUnitsAccessed <= numDataCol/2) uses a 1045 * less-than-or-equal rather than just a less-than because 1046 * when G is 3 or 4, numDataCol/2 is 1, and I want 1047 * single-stripe-unit updates to use just one disk. */ 1048 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) { 1049 if (((asmp->numStripeUnitsAccessed <= 1050 (layoutPtr->numDataCol / 2)) && 1051 (layoutPtr->numDataCol != 1)) || 1052 (asmp->parityInfo->next != NULL) || 1053 rf_CheckStripeForFailures(raidPtr, asmp)) { 1054 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG; 1055 } else 1056 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG; 1057 } else 1058 if (asmp->numParityFailed == 1) 1059 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 1060 else 1061 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) 1062 *createFunc = NULL; 1063 else 1064 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 1065 } 1066 } 1067 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 1068