1 /* $NetBSD: rf_paritylogging.c,v 1.21 2003/12/29 05:48:13 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 30 /* 31 parity logging configuration, dag selection, and mapping is implemented here 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.21 2003/12/29 05:48:13 oster Exp $"); 36 37 #include "rf_archs.h" 38 39 #if RF_INCLUDE_PARITYLOGGING > 0 40 41 #include <dev/raidframe/raidframevar.h> 42 43 #include "rf_raid.h" 44 #include "rf_dag.h" 45 #include "rf_dagutils.h" 46 #include "rf_dagfuncs.h" 47 #include "rf_dagffrd.h" 48 #include "rf_dagffwr.h" 49 #include "rf_dagdegrd.h" 50 #include "rf_dagdegwr.h" 51 #include "rf_paritylog.h" 52 #include "rf_paritylogDiskMgr.h" 53 #include "rf_paritylogging.h" 54 #include "rf_parityloggingdags.h" 55 #include "rf_general.h" 56 #include "rf_map.h" 57 #include "rf_utils.h" 58 #include "rf_shutdown.h" 59 60 typedef struct RF_ParityLoggingConfigInfo_s { 61 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by 62 * IdentifyStripe */ 63 } RF_ParityLoggingConfigInfo_t; 64 65 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID); 66 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); 67 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); 68 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); 69 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); 70 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); 71 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); 72 73 int 74 rf_ConfigureParityLogging( 75 RF_ShutdownList_t ** listp, 76 RF_Raid_t * raidPtr, 77 RF_Config_t * cfgPtr) 78 { 79 int i, j, startdisk, rc; 80 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; 81 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; 82 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 83 RF_ParityLoggingConfigInfo_t *info; 84 RF_ParityLog_t *l = NULL, *next; 85 caddr_t lHeapPtr; 86 87 if (rf_numParityRegions <= 0) 88 return(EINVAL); 89 90 /* 91 * We create multiple entries on the shutdown list here, since 92 * this configuration routine is fairly complicated in and of 93 * itself, and this makes backing out of a failed configuration 94 * much simpler. 95 */ 96 97 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; 98 99 /* create a parity logging configuration structure */ 100 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), 101 (RF_ParityLoggingConfigInfo_t *), 102 raidPtr->cleanupList); 103 if (info == NULL) 104 return (ENOMEM); 105 layoutPtr->layoutSpecificInfo = (void *) info; 106 107 /* the stripe identifier must identify the disks in each stripe, IN 108 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */ 109 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), 110 (raidPtr->numCol), 111 raidPtr->cleanupList); 112 if (info->stripeIdentifier == NULL) 113 return (ENOMEM); 114 115 startdisk = 0; 116 for (i = 0; i < (raidPtr->numCol); i++) { 117 for (j = 0; j < (raidPtr->numCol); j++) { 118 info->stripeIdentifier[i][j] = (startdisk + j) % 119 (raidPtr->numCol - 1); 120 } 121 if ((--startdisk) < 0) 122 startdisk = raidPtr->numCol - 1 - 1; 123 } 124 125 /* fill in the remaining layout parameters */ 126 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; 127 layoutPtr->numParityCol = 1; 128 layoutPtr->numParityLogCol = 1; 129 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - 130 layoutPtr->numParityLogCol; 131 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * 132 layoutPtr->sectorsPerStripeUnit; 133 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; 134 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * 135 layoutPtr->sectorsPerStripeUnit; 136 137 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * 138 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; 139 140 /* configure parity log parameters 141 * 142 * parameter comment/constraints 143 * ------------------------------------------- 144 * numParityRegions* all regions (except possibly last) 145 * of equal size 146 * totalInCoreLogCapacity* amount of memory in bytes available 147 * for in-core logs (default 1 MB) 148 * numSectorsPerLog# capacity of an in-core log in sectors 149 * (1 * disk track) 150 * numParityLogs total number of in-core logs, 151 * should be at least numParityRegions 152 * regionLogCapacity size of a region log (except possibly 153 * last one) in sectors 154 * totalLogCapacity total amount of log space in sectors 155 * 156 * where '*' denotes a user settable parameter. 157 * Note that logs are fixed to be the size of a disk track, 158 * value #defined in rf_paritylog.h 159 * 160 */ 161 162 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; 163 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; 164 if (rf_parityLogDebug) 165 printf("bytes per sector %d\n", raidPtr->bytesPerSector); 166 167 /* reduce fragmentation within a disk region by adjusting the number 168 * of regions in an attempt to allow an integral number of logs to fit 169 * into a disk region */ 170 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; 171 if (fragmentation > 0) 172 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) { 173 if (((totalLogCapacity / (rf_numParityRegions + i)) % 174 raidPtr->numSectorsPerLog) < fragmentation) { 175 rf_numParityRegions++; 176 raidPtr->regionLogCapacity = totalLogCapacity / 177 rf_numParityRegions; 178 fragmentation = raidPtr->regionLogCapacity % 179 raidPtr->numSectorsPerLog; 180 } 181 if (((totalLogCapacity / (rf_numParityRegions - i)) % 182 raidPtr->numSectorsPerLog) < fragmentation) { 183 rf_numParityRegions--; 184 raidPtr->regionLogCapacity = totalLogCapacity / 185 rf_numParityRegions; 186 fragmentation = raidPtr->regionLogCapacity % 187 raidPtr->numSectorsPerLog; 188 } 189 } 190 /* ensure integral number of regions per log */ 191 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / 192 raidPtr->numSectorsPerLog) * 193 raidPtr->numSectorsPerLog; 194 195 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / 196 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); 197 /* to avoid deadlock, must ensure that enough logs exist for each 198 * region to have one simultaneously */ 199 if (raidPtr->numParityLogs < rf_numParityRegions) 200 raidPtr->numParityLogs = rf_numParityRegions; 201 202 /* create region information structs */ 203 printf("Allocating %d bytes for in-core parity region info\n", 204 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t))); 205 RF_Malloc(raidPtr->regionInfo, 206 (rf_numParityRegions * sizeof(RF_RegionInfo_t)), 207 (RF_RegionInfo_t *)); 208 if (raidPtr->regionInfo == NULL) 209 return (ENOMEM); 210 211 /* last region may not be full capacity */ 212 lastRegionCapacity = raidPtr->regionLogCapacity; 213 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + 214 lastRegionCapacity > totalLogCapacity) 215 lastRegionCapacity = lastRegionCapacity - 216 raidPtr->numSectorsPerLog; 217 218 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / 219 rf_numParityRegions; 220 maxRegionParityRange = raidPtr->regionParityRange; 221 222 /* i can't remember why this line is in the code -wvcii 6/30/95 */ 223 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) 224 regionParityRange++; */ 225 226 /* build pool of unused parity logs */ 227 printf("Allocating %d bytes for %d parity logs\n", 228 raidPtr->numParityLogs * raidPtr->numSectorsPerLog * 229 raidPtr->bytesPerSector, 230 raidPtr->numParityLogs); 231 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 232 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, 233 (caddr_t)); 234 if (raidPtr->parityLogBufferHeap == NULL) 235 return (ENOMEM); 236 lHeapPtr = raidPtr->parityLogBufferHeap; 237 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex); 238 if (rc) { 239 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 240 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 241 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 242 return (ENOMEM); 243 } 244 for (i = 0; i < raidPtr->numParityLogs; i++) { 245 if (i == 0) { 246 RF_Malloc(raidPtr->parityLogPool.parityLogs, 247 sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); 248 if (raidPtr->parityLogPool.parityLogs == NULL) { 249 RF_Free(raidPtr->parityLogBufferHeap, 250 raidPtr->numParityLogs * 251 raidPtr->numSectorsPerLog * 252 raidPtr->bytesPerSector); 253 return (ENOMEM); 254 } 255 l = raidPtr->parityLogPool.parityLogs; 256 } else { 257 RF_Malloc(l->next, sizeof(RF_ParityLog_t), 258 (RF_ParityLog_t *)); 259 if (l->next == NULL) { 260 RF_Free(raidPtr->parityLogBufferHeap, 261 raidPtr->numParityLogs * 262 raidPtr->numSectorsPerLog * 263 raidPtr->bytesPerSector); 264 for (l = raidPtr->parityLogPool.parityLogs; 265 l; 266 l = next) { 267 next = l->next; 268 if (l->records) 269 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); 270 RF_Free(l, sizeof(RF_ParityLog_t)); 271 } 272 return (ENOMEM); 273 } 274 l = l->next; 275 } 276 l->bufPtr = lHeapPtr; 277 lHeapPtr += raidPtr->numSectorsPerLog * 278 raidPtr->bytesPerSector; 279 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * 280 sizeof(RF_ParityLogRecord_t)), 281 (RF_ParityLogRecord_t *)); 282 if (l->records == NULL) { 283 RF_Free(raidPtr->parityLogBufferHeap, 284 raidPtr->numParityLogs * 285 raidPtr->numSectorsPerLog * 286 raidPtr->bytesPerSector); 287 for (l = raidPtr->parityLogPool.parityLogs; 288 l; 289 l = next) { 290 next = l->next; 291 if (l->records) 292 RF_Free(l->records, 293 (raidPtr->numSectorsPerLog * 294 sizeof(RF_ParityLogRecord_t))); 295 RF_Free(l, sizeof(RF_ParityLog_t)); 296 } 297 return (ENOMEM); 298 } 299 } 300 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); 301 if (rc) { 302 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 303 __LINE__, rc); 304 rf_ShutdownParityLoggingPool(raidPtr); 305 return (rc); 306 } 307 /* build pool of region buffers */ 308 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex); 309 if (rc) { 310 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 311 return (ENOMEM); 312 } 313 raidPtr->regionBufferPool.cond = 0; 314 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * 315 raidPtr->bytesPerSector; 316 printf("regionBufferPool.bufferSize %d\n", 317 raidPtr->regionBufferPool.bufferSize); 318 319 /* for now, only one region at a time may be reintegrated */ 320 raidPtr->regionBufferPool.totalBuffers = 1; 321 322 raidPtr->regionBufferPool.availableBuffers = 323 raidPtr->regionBufferPool.totalBuffers; 324 raidPtr->regionBufferPool.availBuffersIndex = 0; 325 raidPtr->regionBufferPool.emptyBuffersIndex = 0; 326 printf("Allocating %d bytes for regionBufferPool\n", 327 (int) (raidPtr->regionBufferPool.totalBuffers * 328 sizeof(caddr_t))); 329 RF_Malloc(raidPtr->regionBufferPool.buffers, 330 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), 331 (caddr_t *)); 332 if (raidPtr->regionBufferPool.buffers == NULL) { 333 return (ENOMEM); 334 } 335 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { 336 printf("Allocating %d bytes for regionBufferPool#%d\n", 337 (int) (raidPtr->regionBufferPool.bufferSize * 338 sizeof(char)), i); 339 RF_Malloc(raidPtr->regionBufferPool.buffers[i], 340 raidPtr->regionBufferPool.bufferSize * sizeof(char), 341 (caddr_t)); 342 if (raidPtr->regionBufferPool.buffers[i] == NULL) { 343 for (j = 0; j < i; j++) { 344 RF_Free(raidPtr->regionBufferPool.buffers[i], 345 raidPtr->regionBufferPool.bufferSize * 346 sizeof(char)); 347 } 348 RF_Free(raidPtr->regionBufferPool.buffers, 349 raidPtr->regionBufferPool.totalBuffers * 350 sizeof(caddr_t)); 351 return (ENOMEM); 352 } 353 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, 354 (long) raidPtr->regionBufferPool.buffers[i]); 355 } 356 rc = rf_ShutdownCreate(listp, 357 rf_ShutdownParityLoggingRegionBufferPool, 358 raidPtr); 359 if (rc) { 360 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 361 __LINE__, rc); 362 rf_ShutdownParityLoggingRegionBufferPool(raidPtr); 363 return (rc); 364 } 365 /* build pool of parity buffers */ 366 parityBufferCapacity = maxRegionParityRange; 367 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex); 368 if (rc) { 369 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 370 return (rc); 371 } 372 raidPtr->parityBufferPool.cond = 0; 373 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * 374 raidPtr->bytesPerSector; 375 printf("parityBufferPool.bufferSize %d\n", 376 raidPtr->parityBufferPool.bufferSize); 377 378 /* for now, only one region at a time may be reintegrated */ 379 raidPtr->parityBufferPool.totalBuffers = 1; 380 381 raidPtr->parityBufferPool.availableBuffers = 382 raidPtr->parityBufferPool.totalBuffers; 383 raidPtr->parityBufferPool.availBuffersIndex = 0; 384 raidPtr->parityBufferPool.emptyBuffersIndex = 0; 385 printf("Allocating %d bytes for parityBufferPool of %d units\n", 386 (int) (raidPtr->parityBufferPool.totalBuffers * 387 sizeof(caddr_t)), 388 raidPtr->parityBufferPool.totalBuffers ); 389 RF_Malloc(raidPtr->parityBufferPool.buffers, 390 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), 391 (caddr_t *)); 392 if (raidPtr->parityBufferPool.buffers == NULL) { 393 return (ENOMEM); 394 } 395 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { 396 printf("Allocating %d bytes for parityBufferPool#%d\n", 397 (int) (raidPtr->parityBufferPool.bufferSize * 398 sizeof(char)),i); 399 RF_Malloc(raidPtr->parityBufferPool.buffers[i], 400 raidPtr->parityBufferPool.bufferSize * sizeof(char), 401 (caddr_t)); 402 if (raidPtr->parityBufferPool.buffers == NULL) { 403 for (j = 0; j < i; j++) { 404 RF_Free(raidPtr->parityBufferPool.buffers[i], 405 raidPtr->regionBufferPool.bufferSize * 406 sizeof(char)); 407 } 408 RF_Free(raidPtr->parityBufferPool.buffers, 409 raidPtr->regionBufferPool.totalBuffers * 410 sizeof(caddr_t)); 411 return (ENOMEM); 412 } 413 printf("parityBufferPool.buffers[%d] = %lx\n", i, 414 (long) raidPtr->parityBufferPool.buffers[i]); 415 } 416 rc = rf_ShutdownCreate(listp, 417 rf_ShutdownParityLoggingParityBufferPool, 418 raidPtr); 419 if (rc) { 420 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 421 __LINE__, rc); 422 rf_ShutdownParityLoggingParityBufferPool(raidPtr); 423 return (rc); 424 } 425 /* initialize parityLogDiskQueue */ 426 rf_mutex_init(&raidPtr->parityLogDiskQueue.mutex); 427 raidPtr->parityLogDiskQueue.cond = 0; 428 raidPtr->parityLogDiskQueue.flushQueue = NULL; 429 raidPtr->parityLogDiskQueue.reintQueue = NULL; 430 raidPtr->parityLogDiskQueue.bufHead = NULL; 431 raidPtr->parityLogDiskQueue.bufTail = NULL; 432 raidPtr->parityLogDiskQueue.reintHead = NULL; 433 raidPtr->parityLogDiskQueue.reintTail = NULL; 434 raidPtr->parityLogDiskQueue.logBlockHead = NULL; 435 raidPtr->parityLogDiskQueue.logBlockTail = NULL; 436 raidPtr->parityLogDiskQueue.reintBlockHead = NULL; 437 raidPtr->parityLogDiskQueue.reintBlockTail = NULL; 438 raidPtr->parityLogDiskQueue.freeDataList = NULL; 439 raidPtr->parityLogDiskQueue.freeCommonList = NULL; 440 441 rc = rf_ShutdownCreate(listp, 442 rf_ShutdownParityLoggingDiskQueue, 443 raidPtr); 444 if (rc) { 445 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 446 __LINE__, rc); 447 return (rc); 448 } 449 for (i = 0; i < rf_numParityRegions; i++) { 450 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex); 451 if (rc) { 452 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 453 for (j = 0; j < i; j++) 454 FreeRegionInfo(raidPtr, j); 455 RF_Free(raidPtr->regionInfo, 456 (rf_numParityRegions * 457 sizeof(RF_RegionInfo_t))); 458 return (ENOMEM); 459 } 460 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); 461 if (rc) { 462 rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc); 463 for (j = 0; j < i; j++) 464 FreeRegionInfo(raidPtr, j); 465 RF_Free(raidPtr->regionInfo, 466 (rf_numParityRegions * 467 sizeof(RF_RegionInfo_t))); 468 return (ENOMEM); 469 } 470 raidPtr->regionInfo[i].reintInProgress = RF_FALSE; 471 raidPtr->regionInfo[i].regionStartAddr = 472 raidPtr->regionLogCapacity * i; 473 raidPtr->regionInfo[i].parityStartAddr = 474 raidPtr->regionParityRange * i; 475 if (i < rf_numParityRegions - 1) { 476 raidPtr->regionInfo[i].capacity = 477 raidPtr->regionLogCapacity; 478 raidPtr->regionInfo[i].numSectorsParity = 479 raidPtr->regionParityRange; 480 } else { 481 raidPtr->regionInfo[i].capacity = 482 lastRegionCapacity; 483 raidPtr->regionInfo[i].numSectorsParity = 484 raidPtr->sectorsPerDisk - 485 raidPtr->regionParityRange * i; 486 if (raidPtr->regionInfo[i].numSectorsParity > 487 maxRegionParityRange) 488 maxRegionParityRange = 489 raidPtr->regionInfo[i].numSectorsParity; 490 } 491 raidPtr->regionInfo[i].diskCount = 0; 492 RF_ASSERT(raidPtr->regionInfo[i].capacity + 493 raidPtr->regionInfo[i].regionStartAddr <= 494 totalLogCapacity); 495 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + 496 raidPtr->regionInfo[i].numSectorsParity <= 497 raidPtr->sectorsPerDisk); 498 printf("Allocating %d bytes for region %d\n", 499 (int) (raidPtr->regionInfo[i].capacity * 500 sizeof(RF_DiskMap_t)), i); 501 RF_Malloc(raidPtr->regionInfo[i].diskMap, 502 (raidPtr->regionInfo[i].capacity * 503 sizeof(RF_DiskMap_t)), 504 (RF_DiskMap_t *)); 505 if (raidPtr->regionInfo[i].diskMap == NULL) { 506 for (j = 0; j < i; j++) 507 FreeRegionInfo(raidPtr, j); 508 RF_Free(raidPtr->regionInfo, 509 (rf_numParityRegions * 510 sizeof(RF_RegionInfo_t))); 511 return (ENOMEM); 512 } 513 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; 514 raidPtr->regionInfo[i].coreLog = NULL; 515 } 516 rc = rf_ShutdownCreate(listp, 517 rf_ShutdownParityLoggingRegionInfo, 518 raidPtr); 519 if (rc) { 520 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, 521 __LINE__, rc); 522 rf_ShutdownParityLoggingRegionInfo(raidPtr); 523 return (rc); 524 } 525 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); 526 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; 527 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, 528 rf_ParityLoggingDiskManager, raidPtr,"rf_log"); 529 if (rc) { 530 raidPtr->parityLogDiskQueue.threadState = 0; 531 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", 532 __FILE__, __LINE__, rc); 533 return (ENOMEM); 534 } 535 /* wait for thread to start */ 536 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 537 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) { 538 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 539 raidPtr->parityLogDiskQueue.mutex); 540 } 541 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 542 543 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); 544 if (rc) { 545 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc); 546 rf_ShutdownParityLogging(raidPtr); 547 return (rc); 548 } 549 if (rf_parityLogDebug) { 550 printf(" size of disk log in sectors: %d\n", 551 (int) totalLogCapacity); 552 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions); 553 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity); 554 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation); 555 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); 556 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); 557 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); 558 } 559 rf_EnableParityLogging(raidPtr); 560 561 return (0); 562 } 563 564 static void 565 FreeRegionInfo( 566 RF_Raid_t * raidPtr, 567 RF_RegionId_t regionID) 568 { 569 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 570 RF_Free(raidPtr->regionInfo[regionID].diskMap, 571 (raidPtr->regionInfo[regionID].capacity * 572 sizeof(RF_DiskMap_t))); 573 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { 574 rf_ReleaseParityLogs(raidPtr, 575 raidPtr->regionInfo[regionID].coreLog); 576 raidPtr->regionInfo[regionID].coreLog = NULL; 577 } else { 578 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); 579 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); 580 } 581 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); 582 } 583 584 585 static void 586 FreeParityLogQueue( 587 RF_Raid_t * raidPtr, 588 RF_ParityLogQueue_t * queue) 589 { 590 RF_ParityLog_t *l1, *l2; 591 592 RF_LOCK_MUTEX(queue->mutex); 593 l1 = queue->parityLogs; 594 while (l1) { 595 l2 = l1; 596 l1 = l2->next; 597 RF_Free(l2->records, (raidPtr->numSectorsPerLog * 598 sizeof(RF_ParityLogRecord_t))); 599 RF_Free(l2, sizeof(RF_ParityLog_t)); 600 } 601 RF_UNLOCK_MUTEX(queue->mutex); 602 } 603 604 605 static void 606 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue) 607 { 608 int i; 609 610 RF_LOCK_MUTEX(queue->mutex); 611 if (queue->availableBuffers != queue->totalBuffers) { 612 printf("Attempt to free region queue which is still in use!\n"); 613 RF_ASSERT(0); 614 } 615 for (i = 0; i < queue->totalBuffers; i++) 616 RF_Free(queue->buffers[i], queue->bufferSize); 617 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t)); 618 RF_UNLOCK_MUTEX(queue->mutex); 619 } 620 621 static void 622 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) 623 { 624 RF_Raid_t *raidPtr; 625 RF_RegionId_t i; 626 627 raidPtr = (RF_Raid_t *) arg; 628 if (rf_parityLogDebug) { 629 printf("raid%d: ShutdownParityLoggingRegionInfo\n", 630 raidPtr->raidid); 631 } 632 /* free region information structs */ 633 for (i = 0; i < rf_numParityRegions; i++) 634 FreeRegionInfo(raidPtr, i); 635 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * 636 sizeof(raidPtr->regionInfo))); 637 raidPtr->regionInfo = NULL; 638 } 639 640 static void 641 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) 642 { 643 RF_Raid_t *raidPtr; 644 645 raidPtr = (RF_Raid_t *) arg; 646 if (rf_parityLogDebug) { 647 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid); 648 } 649 /* free contents of parityLogPool */ 650 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); 651 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * 652 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); 653 } 654 655 static void 656 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) 657 { 658 RF_Raid_t *raidPtr; 659 660 raidPtr = (RF_Raid_t *) arg; 661 if (rf_parityLogDebug) { 662 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n", 663 raidPtr->raidid); 664 } 665 FreeRegionBufferQueue(&raidPtr->regionBufferPool); 666 } 667 668 static void 669 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) 670 { 671 RF_Raid_t *raidPtr; 672 673 raidPtr = (RF_Raid_t *) arg; 674 if (rf_parityLogDebug) { 675 printf("raid%d: ShutdownParityLoggingParityBufferPool\n", 676 raidPtr->raidid); 677 } 678 FreeRegionBufferQueue(&raidPtr->parityBufferPool); 679 } 680 681 static void 682 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) 683 { 684 RF_ParityLogData_t *d; 685 RF_CommonLogData_t *c; 686 RF_Raid_t *raidPtr; 687 688 raidPtr = (RF_Raid_t *) arg; 689 if (rf_parityLogDebug) { 690 printf("raid%d: ShutdownParityLoggingDiskQueue\n", 691 raidPtr->raidid); 692 } 693 /* free disk manager stuff */ 694 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); 695 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); 696 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); 697 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); 698 while (raidPtr->parityLogDiskQueue.freeDataList) { 699 d = raidPtr->parityLogDiskQueue.freeDataList; 700 raidPtr->parityLogDiskQueue.freeDataList = 701 raidPtr->parityLogDiskQueue.freeDataList->next; 702 RF_Free(d, sizeof(RF_ParityLogData_t)); 703 } 704 while (raidPtr->parityLogDiskQueue.freeCommonList) { 705 c = raidPtr->parityLogDiskQueue.freeCommonList; 706 raidPtr->parityLogDiskQueue.freeCommonList = 707 raidPtr->parityLogDiskQueue.freeCommonList->next; 708 RF_Free(c, sizeof(RF_CommonLogData_t)); 709 } 710 } 711 712 static void 713 rf_ShutdownParityLogging(RF_ThreadArg_t arg) 714 { 715 RF_Raid_t *raidPtr; 716 717 raidPtr = (RF_Raid_t *) arg; 718 if (rf_parityLogDebug) { 719 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid); 720 } 721 /* shutdown disk thread */ 722 /* This has the desirable side-effect of forcing all regions to be 723 * reintegrated. This is necessary since all parity log maps are 724 * currently held in volatile memory. */ 725 726 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 727 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; 728 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 729 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); 730 /* 731 * pLogDiskThread will now terminate when queues are cleared 732 * now wait for it to be done 733 */ 734 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 735 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) { 736 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, 737 raidPtr->parityLogDiskQueue.mutex); 738 } 739 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); 740 if (rf_parityLogDebug) { 741 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid); 742 } 743 } 744 745 int 746 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr) 747 { 748 return (20); 749 } 750 751 RF_HeadSepLimit_t 752 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr) 753 { 754 return (10); 755 } 756 /* return the region ID for a given RAID address */ 757 RF_RegionId_t 758 rf_MapRegionIDParityLogging( 759 RF_Raid_t * raidPtr, 760 RF_SectorNum_t address) 761 { 762 RF_RegionId_t regionID; 763 764 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ 765 regionID = address / raidPtr->regionParityRange; 766 if (regionID == rf_numParityRegions) { 767 /* last region may be larger than other regions */ 768 regionID--; 769 } 770 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); 771 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + 772 raidPtr->regionInfo[regionID].numSectorsParity); 773 RF_ASSERT(regionID < rf_numParityRegions); 774 return (regionID); 775 } 776 777 778 /* given a logical RAID sector, determine physical disk address of data */ 779 void 780 rf_MapSectorParityLogging( 781 RF_Raid_t * raidPtr, 782 RF_RaidAddr_t raidSector, 783 RF_RowCol_t * col, 784 RF_SectorNum_t * diskSector, 785 int remap) 786 { 787 RF_StripeNum_t SUID = raidSector / 788 raidPtr->Layout.sectorsPerStripeUnit; 789 /* *col = (SUID % (raidPtr->numCol - 790 * raidPtr->Layout.numParityLogCol)); */ 791 *col = SUID % raidPtr->Layout.numDataCol; 792 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 793 raidPtr->Layout.sectorsPerStripeUnit + 794 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 795 } 796 797 798 /* given a logical RAID sector, determine physical disk address of parity */ 799 void 800 rf_MapParityParityLogging( 801 RF_Raid_t * raidPtr, 802 RF_RaidAddr_t raidSector, 803 RF_RowCol_t * col, 804 RF_SectorNum_t * diskSector, 805 int remap) 806 { 807 RF_StripeNum_t SUID = raidSector / 808 raidPtr->Layout.sectorsPerStripeUnit; 809 810 /* *col = 811 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt 812 * r->numCol - raidPtr->Layout.numParityLogCol); */ 813 *col = raidPtr->Layout.numDataCol; 814 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * 815 raidPtr->Layout.sectorsPerStripeUnit + 816 (raidSector % raidPtr->Layout.sectorsPerStripeUnit); 817 } 818 819 820 /* given a regionID and sector offset, determine the physical disk address of the parity log */ 821 void 822 rf_MapLogParityLogging( 823 RF_Raid_t * raidPtr, 824 RF_RegionId_t regionID, 825 RF_SectorNum_t regionOffset, 826 RF_RowCol_t * col, 827 RF_SectorNum_t * startSector) 828 { 829 *col = raidPtr->numCol - 1; 830 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; 831 } 832 833 834 /* given a regionID, determine the physical disk address of the logged 835 parity for that region */ 836 void 837 rf_MapRegionParity( 838 RF_Raid_t * raidPtr, 839 RF_RegionId_t regionID, 840 RF_RowCol_t * col, 841 RF_SectorNum_t * startSector, 842 RF_SectorCount_t * numSector) 843 { 844 *col = raidPtr->numCol - 2; 845 *startSector = raidPtr->regionInfo[regionID].parityStartAddr; 846 *numSector = raidPtr->regionInfo[regionID].numSectorsParity; 847 } 848 849 850 /* given a logical RAID address, determine the participating disks in 851 the stripe */ 852 void 853 rf_IdentifyStripeParityLogging( 854 RF_Raid_t * raidPtr, 855 RF_RaidAddr_t addr, 856 RF_RowCol_t ** diskids) 857 { 858 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, 859 addr); 860 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) 861 raidPtr->Layout.layoutSpecificInfo; 862 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol]; 863 } 864 865 866 void 867 rf_MapSIDToPSIDParityLogging( 868 RF_RaidLayout_t * layoutPtr, 869 RF_StripeNum_t stripeID, 870 RF_StripeNum_t * psID, 871 RF_ReconUnitNum_t * which_ru) 872 { 873 *which_ru = 0; 874 *psID = stripeID; 875 } 876 877 878 /* select an algorithm for performing an access. Returns two pointers, 879 * one to a function that will return information about the DAG, and 880 * another to a function that will create the dag. 881 */ 882 void 883 rf_ParityLoggingDagSelect( 884 RF_Raid_t * raidPtr, 885 RF_IoType_t type, 886 RF_AccessStripeMap_t * asmp, 887 RF_VoidFuncPtr * createFunc) 888 { 889 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 890 RF_PhysDiskAddr_t *failedPDA = NULL; 891 RF_RowCol_t fcol; 892 RF_RowStatus_t rstat; 893 int prior_recon; 894 895 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 896 897 if (asmp->numDataFailed + asmp->numParityFailed > 1) { 898 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); 899 *createFunc = NULL; 900 return; 901 } else 902 if (asmp->numDataFailed + asmp->numParityFailed == 1) { 903 904 /* if under recon & already reconstructed, redirect 905 * the access to the spare drive and eliminate the 906 * failure indication */ 907 failedPDA = asmp->failedPDAs[0]; 908 fcol = failedPDA->col; 909 rstat = raidPtr->status; 910 prior_recon = (rstat == rf_rs_reconfigured) || ( 911 (rstat == rf_rs_reconstructing) ? 912 rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0 913 ); 914 if (prior_recon) { 915 RF_RowCol_t oc = failedPDA->col; 916 RF_SectorNum_t oo = failedPDA->startSector; 917 if (layoutPtr->map->flags & 918 RF_DISTRIBUTE_SPARE) { 919 /* redirect to dist spare space */ 920 921 if (failedPDA == asmp->parityInfo) { 922 923 /* parity has failed */ 924 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, 925 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 926 927 if (asmp->parityInfo->next) { /* redir 2nd component, 928 * if any */ 929 RF_PhysDiskAddr_t *p = asmp->parityInfo->next; 930 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; 931 p->col = failedPDA->col; 932 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + 933 SUoffs; /* cheating: 934 * startSector is not 935 * really a RAID address */ 936 } 937 } else 938 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) { 939 RF_ASSERT(0); /* should not ever 940 * happen */ 941 } else { 942 943 /* data has failed */ 944 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, 945 &failedPDA->col, &failedPDA->startSector, RF_REMAP); 946 947 } 948 949 } else { 950 /* redirect to dedicated spare space */ 951 952 failedPDA->col = raidPtr->Disks[fcol].spareCol; 953 954 /* the parity may have two distinct 955 * components, both of which may need 956 * to be redirected */ 957 if (asmp->parityInfo->next) { 958 if (failedPDA == asmp->parityInfo) { 959 failedPDA->next->col = failedPDA->col; 960 } else 961 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */ 962 asmp->parityInfo->col = failedPDA->col; 963 } 964 } 965 } 966 967 RF_ASSERT(failedPDA->col != -1); 968 969 if (rf_dagDebug || rf_mapDebug) { 970 printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n", 971 raidPtr->raidid, type, oc, (long) oo, failedPDA->col, (long) failedPDA->startSector); 972 } 973 asmp->numDataFailed = asmp->numParityFailed = 0; 974 } 975 } 976 if (type == RF_IO_TYPE_READ) { 977 978 if (asmp->numDataFailed == 0) 979 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; 980 else 981 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG; 982 983 } else { 984 985 986 /* if mirroring, always use large writes. If the access 987 * requires two distinct parity updates, always do a small 988 * write. If the stripe contains a failure but the access 989 * does not, do a small write. The first conditional 990 * (numStripeUnitsAccessed <= numDataCol/2) uses a 991 * less-than-or-equal rather than just a less-than because 992 * when G is 3 or 4, numDataCol/2 is 1, and I want 993 * single-stripe-unit updates to use just one disk. */ 994 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) { 995 if (((asmp->numStripeUnitsAccessed <= 996 (layoutPtr->numDataCol / 2)) && 997 (layoutPtr->numDataCol != 1)) || 998 (asmp->parityInfo->next != NULL) || 999 rf_CheckStripeForFailures(raidPtr, asmp)) { 1000 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG; 1001 } else 1002 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG; 1003 } else 1004 if (asmp->numParityFailed == 1) 1005 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG; 1006 else 1007 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) 1008 *createFunc = NULL; 1009 else 1010 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG; 1011 } 1012 } 1013 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 1014