/* $NetBSD: rf_dagdegrd.c,v 1.29 2013/09/15 12:13:56 martin Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_dagdegrd.c
 *
 * code for creating degraded read DAGs
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_dagdegrd.c,v 1.29 2013/09/15 12:13:56 martin Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_archs.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_general.h"
#include "rf_dagdegrd.h"
#include "rf_map.h"


/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic -- or
 * does not make changes to permanent state -- the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll forward
 * through the graph, blindly executing nodes until it reaches the end.
 * If a graph reaches the end, it is assumed to have completed successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */


/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */

void
rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t *raidPtr,
    RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList)
{
    rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
        &rf_xorRecoveryFuncs);
}


/******************************************************************************
 *
 * DAG creation code begins here
 */


/******************************************************************************
 * Create a degraded read DAG for RAID level 1
 *
 * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
 *
 * The "Rd" node reads data from the surviving disk in the mirror pair
 *   Rpd - read of primary copy
 *   Rsd - read of secondary copy
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (for holding write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *****************************************************************************/

void
rf_CreateRaidOneDegradedReadDAG(RF_Raid_t *raidPtr,
    RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList)
{
    RF_DagNode_t *rdNode, *blockNode, *commitNode, *termNode;
    RF_StripeNum_t parityStripeID;
    RF_ReconUnitNum_t which_ru;
    RF_PhysDiskAddr_t *pda;
    int useMirror;

    useMirror = 0;
    parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
        asmap->raidAddress, &which_ru);
#if RF_DEBUG_DAG
    if (rf_dagDebug) {
        printf("[Creating RAID level 1 degraded read DAG]\n");
    }
#endif
    dag_h->creator = "RaidOneDegradedReadDAG";
    /* decide whether to read the primary or the mirror copy */
    if (asmap->numDataFailed == 0)
        useMirror = RF_FALSE;
    else
        useMirror = RF_TRUE;

    /* total number of nodes = 1 + (block + commit + terminator) */

    rdNode = rf_AllocDAGNode();
    rdNode->list_next = dag_h->nodes;
    dag_h->nodes = rdNode;

    blockNode = rf_AllocDAGNode();
    blockNode->list_next = dag_h->nodes;
    dag_h->nodes = blockNode;

    commitNode = rf_AllocDAGNode();
    commitNode->list_next = dag_h->nodes;
    dag_h->nodes = commitNode;

    termNode = rf_AllocDAGNode();
    termNode->list_next = dag_h->nodes;
    dag_h->nodes = termNode;

    /* This dag cannot commit until the commit node is reached.
     * Errors prior to the commit point imply the dag has failed and
     * must be retried. */
    dag_h->numCommitNodes = 1;
    dag_h->numCommits = 0;
    dag_h->numSuccedents = 1;

    /* initialize the block, commit, and terminator nodes */
    rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
    rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
    rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
        NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

    pda = asmap->physInfo;
    RF_ASSERT(pda != NULL);
    /* parityInfo must describe entire parity unit */
    RF_ASSERT(asmap->parityInfo->next == NULL);

    /* initialize the data node */
    if (!useMirror) {
        /* read primary copy of data */
        rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
            rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
        rdNode->params[0].p = pda;
        rdNode->params[1].p = pda->bufPtr;
        rdNode->params[2].v = parityStripeID;
        rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
            which_ru);
    } else {
        /* read secondary copy of data */
        rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
            rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
        rdNode->params[0].p = asmap->parityInfo;
        rdNode->params[1].p = pda->bufPtr;
        rdNode->params[2].v = parityStripeID;
        rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
            which_ru);
    }

    /* connect header to block node */
    RF_ASSERT(dag_h->numSuccedents == 1);
    RF_ASSERT(blockNode->numAntecedents == 0);
    dag_h->succedents[0] = blockNode;

    /* connect block node to rdnode */
    RF_ASSERT(blockNode->numSuccedents == 1);
    RF_ASSERT(rdNode->numAntecedents == 1);
    blockNode->succedents[0] = rdNode;
    rdNode->antecedents[0] = blockNode;
    rdNode->antType[0] = rf_control;

    /* connect rdnode to commit node */
    RF_ASSERT(rdNode->numSuccedents == 1);
    RF_ASSERT(commitNode->numAntecedents == 1);
    rdNode->succedents[0] = commitNode;
    commitNode->antecedents[0] = rdNode;
    commitNode->antType[0] = rf_control;

    /* connect commit node to terminator */
    RF_ASSERT(commitNode->numSuccedents == 1);
    RF_ASSERT(termNode->numAntecedents == 1);
    RF_ASSERT(termNode->numSuccedents == 0);
    commitNode->succedents[0] = termNode;
    termNode->antecedents[0] = commitNode;
    termNode->antType[0] = rf_control;
}



/******************************************************************************
 *
 * Creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 * Hdr -> Block -> Rud -> Xor -> Cmt -> T
 *              -> Rrd ->
 *              -> Rp  -->
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to C, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the
 * user, and one Rrd for each chunk of surviving user data _not_ being read by
 * the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = parity
 * X = XOR, C = Commit, T = terminate
 *
 * The block node guarantees a single source node.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer
 * where the failed data is supposed to end up.
 * This buffer is zeroed by the code here.  Thus, if you create a degraded
 * read dag, use it, and then re-use it, you have to be sure to zero the
 * target buffer prior to the re-use.
 *
 * The recFunc argument specifies the name of, and the function used by, the
 * redundancy-recovery node.
 *
 *****************************************************************************/

void
rf_CreateDegradedReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList,
    const RF_RedFuncs_t *recFunc)
{
    RF_DagNode_t *rudNodes, *rrdNodes, *xorNode, *blockNode;
    RF_DagNode_t *commitNode, *rpNode, *termNode;
    RF_DagNode_t *tmpNode, *tmprudNode, *tmprrdNode;
    int nRrdNodes, nRudNodes, nXorBufs, i;
    int j, paramNum;
    RF_SectorCount_t sectorsPerSU;
    RF_ReconUnitNum_t which_ru;
    char overlappingPDAs[RF_MAXCOL];    /* a temporary array of flags */
    RF_AccessStripeMapHeader_t *new_asm_h[2];
    RF_PhysDiskAddr_t *pda, *parityPDA;
    RF_StripeNum_t parityStripeID;
    RF_PhysDiskAddr_t *failedPDA;
    RF_RaidLayout_t *layoutPtr;
    char *rpBuf;

    layoutPtr = &(raidPtr->Layout);
    /* failedPDA points to the pda within the asm that targets the failed
     * disk */
    failedPDA = asmap->failedPDAs[0];
    parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
        asmap->raidAddress, &which_ru);
    sectorsPerSU = layoutPtr->sectorsPerStripeUnit;

#if RF_DEBUG_DAG
    if (rf_dagDebug) {
        printf("[Creating degraded read DAG]\n");
    }
#endif
    RF_ASSERT(asmap->numDataFailed == 1);
    dag_h->creator = "DegradedReadDAG";

    /*
     * generate two ASMs identifying the surviving data we need
     * in order to recover the lost data
     */

    /* overlappingPDAs array must be zeroed */
    memset(overlappingPDAs, 0, RF_MAXCOL);
    rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs,
        &rpBuf, overlappingPDAs, allocList);

    /*
     * create all the nodes at once
     *
     * -1 because no access is generated for the failed pda
     */
    nRudNodes = asmap->numStripeUnitsAccessed - 1;
    nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
        ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);

    blockNode = rf_AllocDAGNode();
    blockNode->list_next = dag_h->nodes;
    dag_h->nodes = blockNode;

    commitNode = rf_AllocDAGNode();
    commitNode->list_next = dag_h->nodes;
    dag_h->nodes = commitNode;

    xorNode = rf_AllocDAGNode();
    xorNode->list_next = dag_h->nodes;
    dag_h->nodes = xorNode;

    rpNode = rf_AllocDAGNode();
    rpNode->list_next = dag_h->nodes;
    dag_h->nodes = rpNode;

    termNode = rf_AllocDAGNode();
    termNode->list_next = dag_h->nodes;
    dag_h->nodes = termNode;

    for (i = 0; i < nRudNodes; i++) {
        tmpNode = rf_AllocDAGNode();
        tmpNode->list_next = dag_h->nodes;
        dag_h->nodes = tmpNode;
    }
    rudNodes = dag_h->nodes;

    for (i = 0; i < nRrdNodes; i++) {
        tmpNode = rf_AllocDAGNode();
        tmpNode->list_next = dag_h->nodes;
        dag_h->nodes = tmpNode;
    }
    rrdNodes = dag_h->nodes;

    /* initialize nodes */
    dag_h->numCommitNodes = 1;
    dag_h->numCommits = 0;
    /* This dag cannot commit until the commit node is reached; errors
     * prior to the commit point imply the dag has failed. */
    dag_h->numSuccedents = 1;

    rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, nRudNodes + nRrdNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
    rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
    rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
        NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc,
        NULL, 1, nRudNodes + nRrdNodes + 1, 2 * nXorBufs + 2, 1, dag_h,
        recFunc->SimpleName, allocList);

    /* fill in the Rud nodes */
    tmprudNode = rudNodes;
    for (pda = asmap->physInfo, i = 0; i < nRudNodes; i++, pda = pda->next) {
        if (pda == failedPDA) {
            i--;
            continue;
        }
        rf_InitNode(tmprudNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
            rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
            "Rud", allocList);
        RF_ASSERT(pda);
        tmprudNode->params[0].p = pda;
        tmprudNode->params[1].p = pda->bufPtr;
        tmprudNode->params[2].v = parityStripeID;
        tmprudNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
        tmprudNode = tmprudNode->list_next;
    }

    /* fill in the Rrd nodes */
    i = 0;
    tmprrdNode = rrdNodes;
    if (new_asm_h[0]) {
        for (pda = new_asm_h[0]->stripeMap->physInfo;
             i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
             i++, pda = pda->next) {
            rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
                rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
                dag_h, "Rrd", allocList);
            RF_ASSERT(pda);
            tmprrdNode->params[0].p = pda;
            tmprrdNode->params[1].p = pda->bufPtr;
            tmprrdNode->params[2].v = parityStripeID;
            tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
            tmprrdNode = tmprrdNode->list_next;
        }
    }
    if (new_asm_h[1]) {
        /* tmprrdNode = rrdNodes; */
        /* Don't reset tmprrdNode here -- the old code indexed with i+j,
         * so we simply keep walking tmprrdNode through the next 'j'
         * elements. */
        for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
             j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
             j++, pda = pda->next) {
            rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
                rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
                dag_h, "Rrd", allocList);
            RF_ASSERT(pda);
            tmprrdNode->params[0].p = pda;
            tmprrdNode->params[1].p = pda->bufPtr;
            tmprrdNode->params[2].v = parityStripeID;
            tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
            tmprrdNode = tmprrdNode->list_next;
        }
    }
    /* make a PDA for the parity unit */
    parityPDA = rf_AllocPhysDiskAddr();
    parityPDA->next = dag_h->pda_cleanup_list;
    dag_h->pda_cleanup_list = parityPDA;
    parityPDA->col = asmap->parityInfo->col;
    parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
        * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
    parityPDA->numSector = failedPDA->numSector;

    /* initialize the Rp node */
    rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
        rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList);
    rpNode->params[0].p = parityPDA;
    rpNode->params[1].p = rpBuf;
    rpNode->params[2].v = parityStripeID;
    rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);

    /*
     * the last and nastiest step is to assign all
     * the parameters of the Xor node
     */
    paramNum = 0;
    tmprrdNode = rrdNodes;
    for (i = 0; i < nRrdNodes; i++) {
        /* all the Rrd nodes need to be xored together */
        xorNode->params[paramNum++] = tmprrdNode->params[0];
        xorNode->params[paramNum++] = tmprrdNode->params[1];
        tmprrdNode = tmprrdNode->list_next;
    }
    tmprudNode = rudNodes;
    for (i = 0; i < nRudNodes; i++) {
        /* any Rud nodes that overlap the failed access need to be
         * xored in */
        if (overlappingPDAs[i]) {
            pda = rf_AllocPhysDiskAddr();
            memcpy((char *) pda, (char *) tmprudNode->params[0].p, sizeof(RF_PhysDiskAddr_t));
            /* add it into the pda_cleanup_list *after* the copy, TYVM */
            pda->next = dag_h->pda_cleanup_list;
            dag_h->pda_cleanup_list = pda;
            rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
            xorNode->params[paramNum++].p = pda;
            xorNode->params[paramNum++].p = pda->bufPtr;
        }
        tmprudNode = tmprudNode->list_next;
    }

    /* install parity pda as last set of params to be xor'd */
    xorNode->params[paramNum++].p = parityPDA;
    xorNode->params[paramNum++].p = rpBuf;

    /*
     * the last 2 params to the recovery xor node are
     * the failed PDA and the raidPtr
     */
    xorNode->params[paramNum++].p = failedPDA;
    xorNode->params[paramNum++].p = raidPtr;
    RF_ASSERT(paramNum == 2 * nXorBufs + 2);

    /*
     * The xor node uses results[0] as the target buffer.
     * Set pointer and zero the buffer.  In the kernel, this
     * may be a user buffer in which case we have to remap it.
     */
    xorNode->results[0] = failedPDA->bufPtr;
    memset(failedPDA->bufPtr, 0, rf_RaidAddressToByte(raidPtr,
        failedPDA->numSector));

    /* connect nodes to form graph */
    /* connect the header to the block node */
    RF_ASSERT(dag_h->numSuccedents == 1);
    RF_ASSERT(blockNode->numAntecedents == 0);
    dag_h->succedents[0] = blockNode;

    /* connect the block node to the read nodes */
    RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes));
    RF_ASSERT(rpNode->numAntecedents == 1);
    blockNode->succedents[0] = rpNode;
    rpNode->antecedents[0] = blockNode;
    rpNode->antType[0] = rf_control;
    tmprrdNode = rrdNodes;
    for (i = 0; i < nRrdNodes; i++) {
        RF_ASSERT(tmprrdNode->numSuccedents == 1);
        blockNode->succedents[1 + i] = tmprrdNode;
        tmprrdNode->antecedents[0] = blockNode;
        tmprrdNode->antType[0] = rf_control;
        tmprrdNode = tmprrdNode->list_next;
    }
    tmprudNode = rudNodes;
    for (i = 0; i < nRudNodes; i++) {
        RF_ASSERT(tmprudNode->numSuccedents == 1);
        blockNode->succedents[1 + nRrdNodes + i] = tmprudNode;
        tmprudNode->antecedents[0] = blockNode;
        tmprudNode->antType[0] = rf_control;
        tmprudNode = tmprudNode->list_next;
    }

    /* connect the read nodes to the xor node */
    RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes));
    RF_ASSERT(rpNode->numSuccedents == 1);
    rpNode->succedents[0] = xorNode;
    xorNode->antecedents[0] = rpNode;
    xorNode->antType[0] = rf_trueData;
    tmprrdNode = rrdNodes;
    for (i = 0; i < nRrdNodes; i++) {
        RF_ASSERT(tmprrdNode->numSuccedents == 1);
        tmprrdNode->succedents[0] = xorNode;
        xorNode->antecedents[1 + i] = tmprrdNode;
        xorNode->antType[1 + i] = rf_trueData;
        tmprrdNode = tmprrdNode->list_next;
    }
    tmprudNode = rudNodes;
    for (i = 0; i < nRudNodes; i++) {
        RF_ASSERT(tmprudNode->numSuccedents == 1);
        tmprudNode->succedents[0] = xorNode;
        xorNode->antecedents[1 + nRrdNodes + i] = tmprudNode;
        xorNode->antType[1 + nRrdNodes + i] = rf_trueData;
        tmprudNode = tmprudNode->list_next;
    }

    /* connect the xor node to the commit node */
    RF_ASSERT(xorNode->numSuccedents == 1);
    RF_ASSERT(commitNode->numAntecedents == 1);
    xorNode->succedents[0] = commitNode;
    commitNode->antecedents[0] = xorNode;
    commitNode->antType[0] = rf_control;

    /* connect the commit node to the terminator */
    RF_ASSERT(commitNode->numSuccedents == 1);
    RF_ASSERT(termNode->numAntecedents == 1);
    RF_ASSERT(termNode->numSuccedents == 0);
    commitNode->succedents[0] = termNode;
    termNode->antType[0] = rf_control;
    termNode->antecedents[0] = commitNode;
}

#if (RF_INCLUDE_CHAINDECLUSTER > 0)
/******************************************************************************
 * Create a degraded read DAG for Chained Declustering
 *
 * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
 *
 * The "Rd" node reads data from the surviving disk in the mirror pair
 *   Rpd - read of primary copy
 *   Rsd - read of secondary copy
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (for holding write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *****************************************************************************/

void
rf_CreateRaidCDegradedReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList)
{
    RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
    RF_StripeNum_t parityStripeID;
    int useMirror, i, shiftable;
    RF_ReconUnitNum_t which_ru;
    RF_PhysDiskAddr_t *pda;

    if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
        shiftable = RF_TRUE;
    } else {
        shiftable = RF_FALSE;
    }
    useMirror = 0;
    parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
        asmap->raidAddress, &which_ru);

#if RF_DEBUG_DAG
    if (rf_dagDebug) {
        printf("[Creating RAID C degraded read DAG]\n");
    }
#endif
    dag_h->creator = "RaidCDegradedReadDAG";
    /* decide whether to read the primary or the mirror copy */
    if (asmap->numDataFailed == 0)
        useMirror = RF_FALSE;
    else
        useMirror = RF_TRUE;

    /* total number of nodes = 1 + (block + commit + terminator) */
    RF_MallocAndAdd(nodes, 4 * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
    i = 0;
    rdNode = &nodes[i];
    i++;
    blockNode = &nodes[i];
    i++;
    commitNode = &nodes[i];
    i++;
    termNode = &nodes[i];
    i++;

    /*
     * This dag cannot commit until the commit node is reached.
     * Errors prior to the commit point imply the dag has failed
     * and must be retried.
     */
    dag_h->numCommitNodes = 1;
    dag_h->numCommits = 0;
    dag_h->numSuccedents = 1;

    /* initialize the block, commit, and terminator nodes */
    rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
    rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
        NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
    rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
        NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

    pda = asmap->physInfo;
    RF_ASSERT(pda != NULL);
    /* parityInfo must describe entire parity unit */
    RF_ASSERT(asmap->parityInfo->next == NULL);

    /* initialize the data node */
    if (!useMirror) {
        rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
            rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
        if (shiftable && rf_compute_workload_shift(raidPtr, pda)) {
            /* shift this read to the next disk in line */
            rdNode->params[0].p = asmap->parityInfo;
            rdNode->params[1].p = pda->bufPtr;
            rdNode->params[2].v = parityStripeID;
            rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
        } else {
            /* read primary copy */
            rdNode->params[0].p = pda;
            rdNode->params[1].p = pda->bufPtr;
            rdNode->params[2].v = parityStripeID;
            rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
        }
    } else {
        /* read secondary copy of data */
        rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
            rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
        rdNode->params[0].p = asmap->parityInfo;
        rdNode->params[1].p = pda->bufPtr;
        rdNode->params[2].v = parityStripeID;
        rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
    }

    /* connect header to block node */
    RF_ASSERT(dag_h->numSuccedents == 1);
    RF_ASSERT(blockNode->numAntecedents == 0);
    dag_h->succedents[0] = blockNode;

    /* connect block node to rdnode */
    RF_ASSERT(blockNode->numSuccedents == 1);
    RF_ASSERT(rdNode->numAntecedents == 1);
    blockNode->succedents[0] = rdNode;
    rdNode->antecedents[0] = blockNode;
    rdNode->antType[0] = rf_control;

    /* connect rdnode to commit node */
    RF_ASSERT(rdNode->numSuccedents == 1);
    RF_ASSERT(commitNode->numAntecedents == 1);
    rdNode->succedents[0] = commitNode;
    commitNode->antecedents[0] = rdNode;
    commitNode->antType[0] = rf_control;

    /* connect commit node to terminator */
    RF_ASSERT(commitNode->numSuccedents == 1);
    RF_ASSERT(termNode->numAntecedents == 1);
    RF_ASSERT(termNode->numSuccedents == 0);
    commitNode->succedents[0] = termNode;
    termNode->antecedents[0] = commitNode;
    termNode->antType[0] = rf_control;
}
#endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
/*
 * Generate the PDAs needed to read the surviving data, plus the parity and Q
 * units required to reconstruct up to two failed data units within one
 * stripe.  The recovery-data PDAs are returned in *pdap (count in *nNodep);
 * the parity/Q PDAs are returned in *pqpdap (count per redundancy type in
 * *nPQNodep).
 *
 * XXX move this elsewhere?
 */
void
rf_DD_GenerateFailedAccessASMs(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_PhysDiskAddr_t **pdap, int *nNodep,
    RF_PhysDiskAddr_t **pqpdap, int *nPQNodep,
    RF_AllocListElem_t *allocList)
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    int PDAPerDisk, i;
    RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    int numDataCol = layoutPtr->numDataCol;
    int state;
    RF_SectorNum_t suoff, suend;
    unsigned firstDataCol, napdas, count;
    RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0;
    RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
    RF_PhysDiskAddr_t *pda_p;
    RF_PhysDiskAddr_t *phys_p;
    RF_RaidAddr_t sosAddr;

    /* Determine how many PDAs we will have to generate per unaccessed
     * stripe unit.  If there is only one failed data unit, it is one; if
     * two, possibly two, depending on whether they overlap. */

    fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
    fone_end = fone_start + fone->numSector;

/*
 * CONS_PDA builds a parity or Q PDA covering the same stripe-unit-relative
 * region as a failed data unit: it takes the named asmap field (parityInfo
 * or qInfo), rounds its start down to the stripe unit boundary, offsets it
 * by 'start', sizes it to 'num' sectors, and allocates a read buffer.
 */
#define CONS_PDA(if,start,num) \
    pda_p->col = asmap->if->col; \
    pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
    pda_p->numSector = num; \
    pda_p->next = NULL; \
    RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)

    if (asmap->numDataFailed == 1) {
        PDAPerDisk = 1;
        state = 1;
        RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
        pda_p = *pqpdap;
        /* build p */
        CONS_PDA(parityInfo, fone_start, fone->numSector);
        pda_p->type = RF_PDA_TYPE_PARITY;
        pda_p++;
        /* build q */
        CONS_PDA(qInfo, fone_start, fone->numSector);
        pda_p->type = RF_PDA_TYPE_Q;
    } else {
        ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
        ftwo_end = ftwo_start + ftwo->numSector;
        if (fone->numSector + ftwo->numSector > secPerSU) {
            PDAPerDisk = 1;
            state = 2;
            RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
            pda_p = *pqpdap;
            CONS_PDA(parityInfo, 0, secPerSU);
            pda_p->type = RF_PDA_TYPE_PARITY;
            pda_p++;
            CONS_PDA(qInfo, 0, secPerSU);
            pda_p->type = RF_PDA_TYPE_Q;
        } else {
            PDAPerDisk = 2;
            state = 3;
            /* four of them, fone, then ftwo */
            RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
            pda_p = *pqpdap;
            CONS_PDA(parityInfo, fone_start, fone->numSector);
            pda_p->type = RF_PDA_TYPE_PARITY;
            pda_p++;
            CONS_PDA(qInfo, fone_start, fone->numSector);
            pda_p->type = RF_PDA_TYPE_Q;
            pda_p++;
            CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
            pda_p->type = RF_PDA_TYPE_PARITY;
            pda_p++;
            CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
            pda_p->type = RF_PDA_TYPE_Q;
        }
    }
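
    /*
     * From here on, "state" encodes the failure pattern: 1 = a single
     * failed data unit, 2 = two failed units that together cover more
     * than a stripe unit (handled as the whole unit), 3 = two disjoint
     * failed regions.  PDAPerDisk is the number of recovery PDAs needed
     * per surviving column (2 only in the disjoint case).
     */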
    /* figure out the number of nonaccessed PDAs */
    napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo == NULL ? 1 : 0));
    *nPQNodep = PDAPerDisk;

    /* sweep over the accessed PDAs, figuring out the number of additional
     * PDAs to generate.  Of course, skip the failed ones. */

    count = 0;
    for (pda_p = asmap->physInfo; pda_p; pda_p = pda_p->next) {
        if ((pda_p == fone) || (pda_p == ftwo))
            continue;
        suoff = rf_StripeUnitOffset(layoutPtr, pda_p->startSector);
        suend = suoff + pda_p->numSector;
        switch (state) {
        case 1:    /* one failed PDA to overlap */
            /* if a PDA doesn't contain the failed unit, it can
             * only miss the start or end, not both */
            if ((suoff > fone_start) || (suend < fone_end))
                count++;
            break;
        case 2:    /* whole stripe */
            if (suoff)    /* leak at beginning */
                count++;
            if (suend < numDataCol)    /* leak at end */
                count++;
            break;
        case 3:    /* two disjoint units */
            if ((suoff > fone_start) || (suend < fone_end))
                count++;
            if ((suoff > ftwo_start) || (suend < ftwo_end))
                count++;
            break;
        default:
            RF_PANIC();
        }
    }

    napdas += count;
    *nNodep = napdas;
    if (napdas == 0)
        return;        /* short circuit */

    /* allocate our list of PDAs */

    RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t),
        (RF_PhysDiskAddr_t *), allocList);
    *pdap = pda_p;

    /* link them together */
    for (i = 0; i < (napdas - 1); i++)
        pda_p[i].next = pda_p + (i + 1);

    /* march through the ones up to the first accessed disk */
    firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), asmap->physInfo->raidAddress) % numDataCol;
    sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    for (i = 0; i < firstDataCol; i++) {
        if ((pda_p - (*pdap)) == napdas)
            continue;
        pda_p->type = RF_PDA_TYPE_DATA;
        pda_p->raidAddress = sosAddr + (i * secPerSU);
        (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
        /* skip over dead disks */
        if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
            continue;
        switch (state) {
        case 1:    /* fone */
            pda_p->numSector = fone->numSector;
            pda_p->raidAddress += fone_start;
            pda_p->startSector += fone_start;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
            break;
        case 2:    /* full stripe */
            pda_p->numSector = secPerSU;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
            break;
        case 3:    /* two slabs */
            pda_p->numSector = fone->numSector;
            pda_p->raidAddress += fone_start;
            pda_p->startSector += fone_start;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
            pda_p++;
            pda_p->type = RF_PDA_TYPE_DATA;
            pda_p->raidAddress = sosAddr + (i * secPerSU);
            (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
            pda_p->numSector = ftwo->numSector;
            pda_p->raidAddress += ftwo_start;
            pda_p->startSector += ftwo_start;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
            break;
        default:
            RF_PANIC();
        }
        pda_p++;
    }

    /* march through the touched stripe units */
    for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) {
        if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1]))
            continue;
        suoff = rf_StripeUnitOffset(layoutPtr, phys_p->startSector);
        suend = suoff + phys_p->numSector;
        switch (state) {
        case 1:    /* single buffer */
            if (suoff > fone_start) {
                RF_ASSERT(suend >= fone_end);
                /* The data read starts after the mapped
                 * access; snip off the beginning */
                pda_p->numSector = suoff - fone_start;
                pda_p->raidAddress = sosAddr + (i * secPerSU) + fone_start;
                (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                pda_p++;
            }
            if (suend < fone_end) {
                RF_ASSERT(suoff <= fone_start);
                /* The data read stops before the end of the
                 * failed access; extend */
                pda_p->numSector = fone_end - suend;
                pda_p->raidAddress = sosAddr + (i * secPerSU) + suend;    /* off by one? */
                (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                pda_p++;
            }
            break;
        case 2:    /* whole stripe unit */
            RF_ASSERT((suoff == 0) || (suend == secPerSU));
            if (suend < secPerSU) {    /* short read, snip from end on */
                pda_p->numSector = secPerSU - suend;
                pda_p->raidAddress = sosAddr + (i * secPerSU) + suend;    /* off by one? */
                (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                pda_p++;
            } else if (suoff > 0) {    /* short at front */
                pda_p->numSector = suoff;
                pda_p->raidAddress = sosAddr + (i * secPerSU);
                (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                pda_p++;
            }
            break;
        case 3:    /* two nonoverlapping failures */
            if ((suoff > fone_start) || (suend < fone_end)) {
                if (suoff > fone_start) {
                    RF_ASSERT(suend >= fone_end);
                    /* The data read starts after the
                     * mapped access; snip off the
                     * beginning */
                    pda_p->numSector = suoff - fone_start;
                    pda_p->raidAddress = sosAddr + (i * secPerSU) + fone_start;
                    (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                    RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                    pda_p++;
                }
                if (suend < fone_end) {
                    RF_ASSERT(suoff <= fone_start);
                    /* The data read stops before the end
                     * of the failed access; extend */
                    pda_p->numSector = fone_end - suend;
                    pda_p->raidAddress = sosAddr + (i * secPerSU) + suend;    /* off by one? */
                    (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                    RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                    pda_p++;
                }
            }
            if ((suoff > ftwo_start) || (suend < ftwo_end)) {
                if (suoff > ftwo_start) {
                    RF_ASSERT(suend >= ftwo_end);
                    /* The data read starts after the
                     * mapped access; snip off the
                     * beginning */
                    pda_p->numSector = suoff - ftwo_start;
                    pda_p->raidAddress = sosAddr + (i * secPerSU) + ftwo_start;
                    (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                    RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                    pda_p++;
                }
                if (suend < ftwo_end) {
                    RF_ASSERT(suoff <= ftwo_start);
                    /* The data read stops before the end
                     * of the failed access; extend */
                    pda_p->numSector = ftwo_end - suend;
                    pda_p->raidAddress = sosAddr + (i * secPerSU) + suend;    /* off by one? */
                    (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
                    RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
                    pda_p++;
                }
            }
            break;
        default:
            RF_PANIC();
        }
    }

    /* after the last accessed disk */
    for (; i < numDataCol; i++) {
        if ((pda_p - (*pdap)) == napdas)
            continue;
        pda_p->type = RF_PDA_TYPE_DATA;
        pda_p->raidAddress = sosAddr + (i * secPerSU);
        (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
        /* skip over dead disks */
        if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
            continue;
        switch (state) {
        case 1:    /* fone */
            pda_p->numSector = fone->numSector;
            pda_p->raidAddress += fone_start;
            pda_p->startSector += fone_start;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
            break;
        case 2:    /* full stripe */
            pda_p->numSector = secPerSU;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
            break;
        case 3:    /* two slabs */
            pda_p->numSector = fone->numSector;
            pda_p->raidAddress += fone_start;
            pda_p->startSector += fone_start;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
            pda_p++;
            pda_p->type = RF_PDA_TYPE_DATA;
            pda_p->raidAddress = sosAddr + (i * secPerSU);
            (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
            pda_p->numSector = ftwo->numSector;
            pda_p->raidAddress += ftwo_start;
            pda_p->startSector += ftwo_start;
            RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
            break;
        default:
            RF_PANIC();
        }
        pda_p++;
    }

    RF_ASSERT(pda_p - *pdap == napdas);
    return;
}
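
/*
 * Helper macros for rf_DoubleDegRead below.  INIT_DISK_NODE sets up a disk
 * read node whose successors are the unblock and recovery nodes and whose
 * antecedent is the block node; DISK_NODE_PARAMS fills in the standard four
 * parameters of such a node (PDA, buffer, parity stripe ID, priority/RU).
 */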
#define INIT_DISK_NODE(node,name) \
    rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0, dag_h, name, allocList); \
    (node)->succedents[0] = unblockNode; \
    (node)->succedents[1] = recoveryNode; \
    (node)->antecedents[0] = blockNode; \
    (node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
    (_node_).params[0].p = _p_ ; \
    (_node_).params[1].p = (_p_)->bufPtr; \
    (_node_).params[2].v = parityStripeID; \
    (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

void
rf_DoubleDegRead(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList,
    const char *redundantReadNodeName,
    const char *recoveryNodeName,
    int (*recovFunc) (RF_DagNode_t *))
{
    RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode,
        *unblockNode, *rpNodes, *rqNodes, *termNode;
    RF_PhysDiskAddr_t *pda, *pqPDAs;
    RF_PhysDiskAddr_t *npdas;
    int nNodes, nRrdNodes, nRudNodes, i;
    RF_ReconUnitNum_t which_ru;
    int nReadNodes, nPQNodes;
    RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
    RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
    RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);

#if RF_DEBUG_DAG
    if (rf_dagDebug)
        printf("[Creating Double Degraded Read DAG]\n");
#endif
    rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);

    nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
    nReadNodes = nRrdNodes + nRudNodes + 2 * nPQNodes;
    nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes;

    RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
    i = 0;
    blockNode = &nodes[i];
    i += 1;
    unblockNode = &nodes[i];
    i += 1;
    recoveryNode = &nodes[i];
    i += 1;
    termNode = &nodes[i];
    i += 1;
    rudNodes = &nodes[i];
    i += nRudNodes;
    rrdNodes = &nodes[i];
    i += nRrdNodes;
    rpNodes = &nodes[i];
    i += nPQNodes;
    rqNodes = &nodes[i];
    i += nPQNodes;
    RF_ASSERT(i == nNodes);

    dag_h->numSuccedents = 1;
    dag_h->succedents[0] = blockNode;
    dag_h->creator = "DoubleDegRead";
    dag_h->numCommits = 0;
    dag_h->numCommitNodes = 1;    /* unblock */

    rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
    termNode->antecedents[0] = unblockNode;
    termNode->antType[0] = rf_control;
    termNode->antecedents[1] = recoveryNode;
    termNode->antType[1] = rf_control;

    /* init the block and unblock nodes */
    /* The block node has all nodes except itself, unblock and recovery as
     * successors.  Similarly for predecessors of the unblock. */
    rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
    rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList);

    for (i = 0; i < nReadNodes; i++) {
        blockNode->succedents[i] = rudNodes + i;
        unblockNode->antecedents[i] = rudNodes + i;
        unblockNode->antType[i] = rf_control;
    }
    unblockNode->succedents[0] = termNode;

    /* The recovery node has all the reads as predecessors and the term
     * node as a successor.  It gets a pda as a param from each of the
     * read nodes plus the raidPtr.  For each failed unit it has a result
     * pda. */
    rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
        1,            /* successors */
        nReadNodes,        /* preds */
        nReadNodes + 2,        /* params */
        asmap->numDataFailed,    /* results */
        dag_h, recoveryNodeName, allocList);

    recoveryNode->succedents[0] = termNode;
    for (i = 0; i < nReadNodes; i++) {
        recoveryNode->antecedents[i] = rudNodes + i;
        recoveryNode->antType[i] = rf_trueData;
    }

    /* build the read nodes, then come back and fill in recovery params
     * and results */
    pda = asmap->physInfo;
    for (i = 0; i < nRudNodes; pda = pda->next) {
        if ((pda == failedPDA) || (pda == failedPDAtwo))
            continue;
        INIT_DISK_NODE(rudNodes + i, "Rud");
        RF_ASSERT(pda);
        DISK_NODE_PARAMS(rudNodes[i], pda);
        i++;
    }

    pda = npdas;
    for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
        INIT_DISK_NODE(rrdNodes + i, "Rrd");
        RF_ASSERT(pda);
        DISK_NODE_PARAMS(rrdNodes[i], pda);
    }

    /* redundancy pdas */
    pda = pqPDAs;
    INIT_DISK_NODE(rpNodes, "Rp");
    RF_ASSERT(pda);
    DISK_NODE_PARAMS(rpNodes[0], pda);
    pda++;
    INIT_DISK_NODE(rqNodes, redundantReadNodeName);
    RF_ASSERT(pda);
    DISK_NODE_PARAMS(rqNodes[0], pda);
    if (nPQNodes == 2) {
        pda++;
        INIT_DISK_NODE(rpNodes + 1, "Rp");
        RF_ASSERT(pda);
        DISK_NODE_PARAMS(rpNodes[1], pda);
        pda++;
        INIT_DISK_NODE(rqNodes + 1, redundantReadNodeName);
        RF_ASSERT(pda);
        DISK_NODE_PARAMS(rqNodes[1], pda);
    }
    /* fill in recovery node params */
    for (i = 0; i < nReadNodes; i++)
        recoveryNode->params[i] = rudNodes[i].params[0];    /* pda */
    recoveryNode->params[i++].p = (void *) raidPtr;
    recoveryNode->params[i++].p = (void *) asmap;
    recoveryNode->results[0] = failedPDA;
    if (asmap->numDataFailed == 2)
        recoveryNode->results[1] = failedPDAtwo;

    /* zero fill the target data buffers? */
}

#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */