1 /* $NetBSD: rf_pq.c,v 1.9 2001/07/18 06:45:34 thorpej Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Daniel Stodolsky 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /* 30 * Code for RAID level 6 (P + Q) disk array architecture. 31 */ 32 33 #include "rf_archs.h" 34 35 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) 36 37 #include "rf_types.h" 38 #include "rf_raid.h" 39 #include "rf_dag.h" 40 #include "rf_dagffrd.h" 41 #include "rf_dagffwr.h" 42 #include "rf_dagdegrd.h" 43 #include "rf_dagdegwr.h" 44 #include "rf_dagutils.h" 45 #include "rf_dagfuncs.h" 46 #include "rf_etimer.h" 47 #include "rf_pqdeg.h" 48 #include "rf_general.h" 49 #include "rf_map.h" 50 #include "rf_pq.h" 51 52 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"}; 53 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"}; 54 55 int 56 rf_RegularONPFunc(node) 57 RF_DagNode_t *node; 58 { 59 return (rf_RegularXorFunc(node)); 60 } 61 /* 62 same as simpleONQ func, but the coefficient is always 1 63 */ 64 65 int 66 rf_SimpleONPFunc(node) 67 RF_DagNode_t *node; 68 { 69 return (rf_SimpleXorFunc(node)); 70 } 71 72 int 73 rf_RecoveryPFunc(node) 74 RF_DagNode_t *node; 75 { 76 return (rf_RecoveryXorFunc(node)); 77 } 78 79 int 80 rf_RegularPFunc(node) 81 RF_DagNode_t *node; 82 { 83 return (rf_RegularXorFunc(node)); 84 } 85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */ 86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 87 88 static void 89 QDelta(char *dest, char *obuf, char *nbuf, unsigned length, 90 unsigned char coeff); 91 static void 92 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, 93 unsigned length, unsigned coeff); 94 95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"}; 96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"}; 97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"}; 98 99 void 100 rf_PQDagSelect( 101 RF_Raid_t * raidPtr, 102 RF_IoType_t type, 103 RF_AccessStripeMap_t * asmap, 104 RF_VoidFuncPtr * createFunc) 105 { 106 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 107 unsigned ndfail = asmap->numDataFailed; 108 unsigned npfail = asmap->numParityFailed; 109 unsigned ntfail = npfail + ndfail; 110 111 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 112 if (ntfail > 2) { 113 RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n"); 114 /* *infoFunc = */ *createFunc = NULL; 115 return; 116 } 117 /* ok, we can do this I/O */ 118 if (type == RF_IO_TYPE_READ) { 119 switch (ndfail) { 120 case 0: 121 /* fault free read */ 122 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; /* same as raid 5 */ 123 break; 124 case 1: 125 /* lost a single data unit */ 126 /* two cases: (1) parity is not lost. do a normal raid 127 * 5 reconstruct read. (2) parity is lost. do a 128 * reconstruct read using "q". */ 129 if (ntfail == 2) { /* also lost redundancy */ 130 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) 131 *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG; 132 else 133 *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG; 134 } else { 135 /* P and Q are ok. But is there a failure in 136 * some unaccessed data unit? */ 137 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) 138 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG; 139 else 140 *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG; 141 } 142 break; 143 case 2: 144 /* lost two data units */ 145 /* *infoFunc = PQOneTwo; */ 146 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG; 147 break; 148 } 149 return; 150 } 151 /* a write */ 152 switch (ntfail) { 153 case 0: /* fault free */ 154 if (rf_suppressLocksAndLargeWrites || 155 (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || 156 (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { 157 158 *createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG; 159 } else { 160 *createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG; 161 } 162 break; 163 164 case 1: /* single disk fault */ 165 if (npfail == 1) { 166 RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)); 167 if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) { /* q died, treat like 168 * normal mode raid5 169 * write. */ 170 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) 171 || rf_NumFailedDataUnitsInStripe(raidPtr, asmap)) 172 *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG; 173 else 174 *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG; 175 } else {/* parity died, small write only updating Q */ 176 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) 177 || rf_NumFailedDataUnitsInStripe(raidPtr, asmap)) 178 *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG; 179 else 180 *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG; 181 } 182 } else { /* data missing. Do a P reconstruct write if 183 * only a single data unit is lost in the 184 * stripe, otherwise a PQ reconstruct write. */ 185 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) 186 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG; 187 else 188 *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG; 189 } 190 break; 191 192 case 2: /* two disk faults */ 193 switch (npfail) { 194 case 2: /* both p and q dead */ 195 *createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG; 196 break; 197 case 1: /* either p or q and dead data */ 198 RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA); 199 RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)); 200 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) 201 *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG; 202 else 203 *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG; 204 break; 205 case 0: /* double data loss */ 206 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG; 207 break; 208 } 209 break; 210 211 default: /* more than 2 disk faults */ 212 *createFunc = NULL; 213 RF_PANIC(); 214 } 215 return; 216 } 217 /* 218 Used as a stop gap info function 219 */ 220 #if 0 221 static void 222 PQOne(raidPtr, nSucc, nAnte, asmap) 223 RF_Raid_t *raidPtr; 224 int *nSucc; 225 int *nAnte; 226 RF_AccessStripeMap_t *asmap; 227 { 228 *nSucc = *nAnte = 1; 229 } 230 231 static void 232 PQOneTwo(raidPtr, nSucc, nAnte, asmap) 233 RF_Raid_t *raidPtr; 234 int *nSucc; 235 int *nAnte; 236 RF_AccessStripeMap_t *asmap; 237 { 238 *nSucc = 1; 239 *nAnte = 2; 240 } 241 #endif 242 243 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG) 244 { 245 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, 246 rf_RegularPQFunc, RF_FALSE); 247 } 248 249 int 250 rf_RegularONQFunc(node) 251 RF_DagNode_t *node; 252 { 253 int np = node->numParams; 254 int d; 255 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 256 int i; 257 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 258 RF_Etimer_t timer; 259 char *qbuf, *qpbuf; 260 char *obuf, *nbuf; 261 RF_PhysDiskAddr_t *old, *new; 262 unsigned long coeff; 263 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 264 265 RF_ETIMER_START(timer); 266 267 d = (np - 3) / 4; 268 RF_ASSERT(4 * d + 3 == np); 269 qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */ 270 for (i = 0; i < d; i++) { 271 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 272 obuf = (char *) node->params[2 * i + 1].p; 273 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p; 274 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p; 275 RF_ASSERT(new->numSector == old->numSector); 276 RF_ASSERT(new->raidAddress == old->raidAddress); 277 /* the stripe unit within the stripe tells us the coefficient 278 * to use for the multiply. */ 279 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress); 280 /* compute the data unit offset within the column, then add 281 * one */ 282 coeff = (coeff % raidPtr->Layout.numDataCol); 283 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU); 284 QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 285 } 286 287 RF_ETIMER_STOP(timer); 288 RF_ETIMER_EVAL(timer); 289 tracerec->q_us += RF_ETIMER_VAL_US(timer); 290 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no 291 * I/O in this node */ 292 return (0); 293 } 294 /* 295 See the SimpleXORFunc for the difference between a simple and regular func. 296 These Q functions should be used for 297 298 new q = Q(data,old data,old q) 299 300 style updates and not for 301 302 q = ( new data, new data, .... ) 303 304 computations. 305 306 The simple q takes 2(2d+1)+1 params, where d is the number 307 of stripes written. The order of params is 308 old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d 309 [2d] old q pda_0, old q buffer 310 [2d_2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d 311 raidPtr 312 */ 313 314 int 315 rf_SimpleONQFunc(node) 316 RF_DagNode_t *node; 317 { 318 int np = node->numParams; 319 int d; 320 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 321 int i; 322 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 323 RF_Etimer_t timer; 324 char *qbuf; 325 char *obuf, *nbuf; 326 RF_PhysDiskAddr_t *old, *new; 327 unsigned long coeff; 328 329 RF_ETIMER_START(timer); 330 331 d = (np - 3) / 4; 332 RF_ASSERT(4 * d + 3 == np); 333 qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */ 334 for (i = 0; i < d; i++) { 335 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 336 obuf = (char *) node->params[2 * i + 1].p; 337 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p; 338 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p; 339 RF_ASSERT(new->numSector == old->numSector); 340 RF_ASSERT(new->raidAddress == old->raidAddress); 341 /* the stripe unit within the stripe tells us the coefficient 342 * to use for the multiply. */ 343 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress); 344 /* compute the data unit offset within the column, then add 345 * one */ 346 coeff = (coeff % raidPtr->Layout.numDataCol); 347 QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 348 } 349 350 RF_ETIMER_STOP(timer); 351 RF_ETIMER_EVAL(timer); 352 tracerec->q_us += RF_ETIMER_VAL_US(timer); 353 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no 354 * I/O in this node */ 355 return (0); 356 } 357 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG) 358 { 359 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs); 360 } 361 362 static void RegularQSubr(RF_DagNode_t *node, char *qbuf); 363 364 static void 365 RegularQSubr(node, qbuf) 366 RF_DagNode_t *node; 367 char *qbuf; 368 { 369 int np = node->numParams; 370 int d; 371 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 372 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 373 int i; 374 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 375 RF_Etimer_t timer; 376 char *obuf, *qpbuf; 377 RF_PhysDiskAddr_t *old; 378 unsigned long coeff; 379 380 RF_ETIMER_START(timer); 381 382 d = (np - 1) / 2; 383 RF_ASSERT(2 * d + 1 == np); 384 for (i = 0; i < d; i++) { 385 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 386 obuf = (char *) node->params[2 * i + 1].p; 387 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); 388 /* compute the data unit offset within the column, then add 389 * one */ 390 coeff = (coeff % raidPtr->Layout.numDataCol); 391 /* the input buffers may not all be aligned with the start of 392 * the stripe. so shift by their sector offset within the 393 * stripe unit */ 394 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU); 395 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 396 } 397 398 RF_ETIMER_STOP(timer); 399 RF_ETIMER_EVAL(timer); 400 tracerec->q_us += RF_ETIMER_VAL_US(timer); 401 } 402 /* 403 used in degraded writes. 404 */ 405 406 static void DegrQSubr(RF_DagNode_t *node); 407 408 static void 409 DegrQSubr(node) 410 RF_DagNode_t *node; 411 { 412 int np = node->numParams; 413 int d; 414 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 415 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 416 int i; 417 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 418 RF_Etimer_t timer; 419 char *qbuf = node->results[1]; 420 char *obuf, *qpbuf; 421 RF_PhysDiskAddr_t *old; 422 unsigned long coeff; 423 unsigned fail_start; 424 int j; 425 426 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p; 427 fail_start = old->startSector % secPerSU; 428 429 RF_ETIMER_START(timer); 430 431 d = (np - 2) / 2; 432 RF_ASSERT(2 * d + 2 == np); 433 for (i = 0; i < d; i++) { 434 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 435 obuf = (char *) node->params[2 * i + 1].p; 436 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); 437 /* compute the data unit offset within the column, then add 438 * one */ 439 coeff = (coeff % raidPtr->Layout.numDataCol); 440 /* the input buffers may not all be aligned with the start of 441 * the stripe. so shift by their sector offset within the 442 * stripe unit */ 443 j = old->startSector % secPerSU; 444 RF_ASSERT(j >= fail_start); 445 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start); 446 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 447 } 448 449 RF_ETIMER_STOP(timer); 450 RF_ETIMER_EVAL(timer); 451 tracerec->q_us += RF_ETIMER_VAL_US(timer); 452 } 453 /* 454 Called by large write code to compute the new parity and the new q. 455 456 structure of the params: 457 458 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol 459 raidPtr 460 461 for a total of 2d+1 arguments. 462 The result buffers results[0], results[1] are the buffers for the p and q, 463 respectively. 464 465 We compute Q first, then compute P. The P calculation may try to reuse 466 one of the input buffers for its output, so if we computed P first, we would 467 corrupt the input for the q calculation. 468 */ 469 470 int 471 rf_RegularPQFunc(node) 472 RF_DagNode_t *node; 473 { 474 RegularQSubr(node, node->results[1]); 475 return (rf_RegularXorFunc(node)); /* does the wakeup */ 476 } 477 478 int 479 rf_RegularQFunc(node) 480 RF_DagNode_t *node; 481 { 482 /* Almost ... adjust Qsubr args */ 483 RegularQSubr(node, node->results[0]); 484 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no 485 * I/O in this node */ 486 return (0); 487 } 488 /* 489 Called by singly degraded write code to compute the new parity and the new q. 490 491 structure of the params: 492 493 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d 494 failedPDA raidPtr 495 496 for a total of 2d+2 arguments. 497 The result buffers results[0], results[1] are the buffers for the parity and q, 498 respectively. 499 500 We compute Q first, then compute parity. The parity calculation may try to reuse 501 one of the input buffers for its output, so if we computed parity first, we would 502 corrupt the input for the q calculation. 503 504 We treat this identically to the regularPQ case, ignoring the failedPDA extra argument. 505 */ 506 507 void 508 rf_Degraded_100_PQFunc(node) 509 RF_DagNode_t *node; 510 { 511 int np = node->numParams; 512 513 RF_ASSERT(np >= 2); 514 DegrQSubr(node); 515 rf_RecoveryXorFunc(node); 516 } 517 518 519 /* 520 The two below are used when reading a stripe with a single lost data unit. 521 The parameters are 522 523 pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr 524 525 and results[0] contains the data buffer. Which is originally zero-filled. 526 527 */ 528 529 /* this Q func is used by the degraded-mode dag functions to recover lost data. 530 * the second-to-last parameter is the PDA for the failed portion of the access. 531 * the code here looks at this PDA and assumes that the xor target buffer is 532 * equal in size to the number of sectors in the failed PDA. It then uses 533 * the other PDAs in the parameter list to determine where within the target 534 * buffer the corresponding data should be xored. 535 * 536 * Recall the basic equation is 537 * 538 * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256 539 * 540 * so to recover data_j we need 541 * 542 * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256 543 * 544 * So the coefficient for each buffer is (255 - data_col), and j should be initialized by 545 * copying Q into it. Then we need to do a table lookup to convert to solve 546 * data_j /= J 547 * 548 * 549 */ 550 int 551 rf_RecoveryQFunc(node) 552 RF_DagNode_t *node; 553 { 554 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 555 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 556 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 557 int i; 558 RF_PhysDiskAddr_t *pda; 559 RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 560 char *srcbuf, *destbuf; 561 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 562 RF_Etimer_t timer; 563 unsigned long coeff; 564 565 RF_ETIMER_START(timer); 566 /* start by copying Q into the buffer */ 567 bcopy(node->params[node->numParams - 3].p, node->results[0], 568 rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); 569 for (i = 0; i < node->numParams - 4; i += 2) { 570 RF_ASSERT(node->params[i + 1].p != node->results[0]); 571 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 572 srcbuf = (char *) node->params[i + 1].p; 573 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 574 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 575 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress); 576 /* compute the data unit offset within the column */ 577 coeff = (coeff % raidPtr->Layout.numDataCol); 578 rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); 579 } 580 /* Do the nasty inversion now */ 581 coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol); 582 rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); 583 RF_ETIMER_STOP(timer); 584 RF_ETIMER_EVAL(timer); 585 tracerec->q_us += RF_ETIMER_VAL_US(timer); 586 rf_GenericWakeupFunc(node, 0); 587 return (0); 588 } 589 590 int 591 rf_RecoveryPQFunc(node) 592 RF_DagNode_t *node; 593 { 594 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 595 printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid); 596 return (1); 597 } 598 /* 599 Degraded write Q subroutine. 600 Used when P is dead. 601 Large-write style Q computation. 602 Parameters 603 604 (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr. 605 606 We ignore failedPDA. 607 608 This is a "simple style" recovery func. 609 */ 610 611 void 612 rf_PQ_DegradedWriteQFunc(node) 613 RF_DagNode_t *node; 614 { 615 int np = node->numParams; 616 int d; 617 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 618 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 619 int i; 620 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 621 RF_Etimer_t timer; 622 char *qbuf = node->results[0]; 623 char *obuf, *qpbuf; 624 RF_PhysDiskAddr_t *old; 625 unsigned long coeff; 626 int fail_start, j; 627 628 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p; 629 fail_start = old->startSector % secPerSU; 630 631 RF_ETIMER_START(timer); 632 633 d = (np - 2) / 2; 634 RF_ASSERT(2 * d + 2 == np); 635 636 for (i = 0; i < d; i++) { 637 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 638 obuf = (char *) node->params[2 * i + 1].p; 639 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); 640 /* compute the data unit offset within the column, then add 641 * one */ 642 coeff = (coeff % raidPtr->Layout.numDataCol); 643 j = old->startSector % secPerSU; 644 RF_ASSERT(j >= fail_start); 645 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start); 646 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 647 } 648 649 RF_ETIMER_STOP(timer); 650 RF_ETIMER_EVAL(timer); 651 tracerec->q_us += RF_ETIMER_VAL_US(timer); 652 rf_GenericWakeupFunc(node, 0); 653 } 654 655 656 657 658 /* Q computations */ 659 660 /* 661 coeff - colummn; 662 663 compute dest ^= qfor[28-coeff][rn[coeff+1] a] 664 665 on 5-bit basis; 666 length in bytes; 667 */ 668 669 void 670 rf_IncQ(dest, buf, length, coeff) 671 unsigned long *dest; 672 unsigned long *buf; 673 unsigned length; 674 unsigned coeff; 675 { 676 unsigned long a, d, new; 677 unsigned long a1, a2; 678 unsigned int *q = &(rf_qfor[28 - coeff][0]); 679 unsigned r = rf_rn[coeff + 1]; 680 681 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f) 682 #define INSERT(a,i) (a << (5L*i)) 683 684 length /= 8; 685 /* 13 5 bit quants in a 64 bit word */ 686 while (length) { 687 a = *buf++; 688 d = *dest; 689 a1 = EXTRACT(a, 0) ^ r; 690 a2 = EXTRACT(a, 1) ^ r; 691 new = INSERT(a2, 1) | a1; 692 a1 = EXTRACT(a, 2) ^ r; 693 a2 = EXTRACT(a, 3) ^ r; 694 a1 = q[a1]; 695 a2 = q[a2]; 696 new = new | INSERT(a1, 2) | INSERT(a2, 3); 697 a1 = EXTRACT(a, 4) ^ r; 698 a2 = EXTRACT(a, 5) ^ r; 699 a1 = q[a1]; 700 a2 = q[a2]; 701 new = new | INSERT(a1, 4) | INSERT(a2, 5); 702 a1 = EXTRACT(a, 5) ^ r; 703 a2 = EXTRACT(a, 6) ^ r; 704 a1 = q[a1]; 705 a2 = q[a2]; 706 new = new | INSERT(a1, 5) | INSERT(a2, 6); 707 #if RF_LONGSHIFT > 2 708 a1 = EXTRACT(a, 7) ^ r; 709 a2 = EXTRACT(a, 8) ^ r; 710 a1 = q[a1]; 711 a2 = q[a2]; 712 new = new | INSERT(a1, 7) | INSERT(a2, 8); 713 a1 = EXTRACT(a, 9) ^ r; 714 a2 = EXTRACT(a, 10) ^ r; 715 a1 = q[a1]; 716 a2 = q[a2]; 717 new = new | INSERT(a1, 9) | INSERT(a2, 10); 718 a1 = EXTRACT(a, 11) ^ r; 719 a2 = EXTRACT(a, 12) ^ r; 720 a1 = q[a1]; 721 a2 = q[a2]; 722 new = new | INSERT(a1, 11) | INSERT(a2, 12); 723 #endif /* RF_LONGSHIFT > 2 */ 724 d ^= new; 725 *dest++ = d; 726 length--; 727 } 728 } 729 /* 730 compute 731 732 dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ] 733 734 on a five bit basis. 735 optimization: compute old ^ new on 64 bit basis. 736 737 length in bytes. 738 */ 739 740 static void 741 QDelta( 742 char *dest, 743 char *obuf, 744 char *nbuf, 745 unsigned length, 746 unsigned char coeff) 747 { 748 unsigned long a, d, new; 749 unsigned long a1, a2; 750 unsigned int *q = &(rf_qfor[28 - coeff][0]); 751 unsigned int r = rf_rn[coeff + 1]; 752 753 r = a1 = a2 = new = d = a = 0; /* XXX for now... */ 754 q = NULL; /* XXX for now */ 755 756 #ifdef _KERNEL 757 /* PQ in kernel currently not supported because the encoding/decoding 758 * table is not present */ 759 memset(dest, 0, length); 760 #else /* KERNEL */ 761 /* this code probably doesn't work and should be rewritten -wvcii */ 762 /* 13 5 bit quants in a 64 bit word */ 763 length /= 8; 764 while (length) { 765 a = *obuf++; /* XXX need to reorg to avoid cache conflicts */ 766 a ^= *nbuf++; 767 d = *dest; 768 a1 = EXTRACT(a, 0) ^ r; 769 a2 = EXTRACT(a, 1) ^ r; 770 a1 = q[a1]; 771 a2 = q[a2]; 772 new = INSERT(a2, 1) | a1; 773 a1 = EXTRACT(a, 2) ^ r; 774 a2 = EXTRACT(a, 3) ^ r; 775 a1 = q[a1]; 776 a2 = q[a2]; 777 new = new | INSERT(a1, 2) | INSERT(a2, 3); 778 a1 = EXTRACT(a, 4) ^ r; 779 a2 = EXTRACT(a, 5) ^ r; 780 a1 = q[a1]; 781 a2 = q[a2]; 782 new = new | INSERT(a1, 4) | INSERT(a2, 5); 783 a1 = EXTRACT(a, 5) ^ r; 784 a2 = EXTRACT(a, 6) ^ r; 785 a1 = q[a1]; 786 a2 = q[a2]; 787 new = new | INSERT(a1, 5) | INSERT(a2, 6); 788 #if RF_LONGSHIFT > 2 789 a1 = EXTRACT(a, 7) ^ r; 790 a2 = EXTRACT(a, 8) ^ r; 791 a1 = q[a1]; 792 a2 = q[a2]; 793 new = new | INSERT(a1, 7) | INSERT(a2, 8); 794 a1 = EXTRACT(a, 9) ^ r; 795 a2 = EXTRACT(a, 10) ^ r; 796 a1 = q[a1]; 797 a2 = q[a2]; 798 new = new | INSERT(a1, 9) | INSERT(a2, 10); 799 a1 = EXTRACT(a, 11) ^ r; 800 a2 = EXTRACT(a, 12) ^ r; 801 a1 = q[a1]; 802 a2 = q[a2]; 803 new = new | INSERT(a1, 11) | INSERT(a2, 12); 804 #endif /* RF_LONGSHIFT > 2 */ 805 d ^= new; 806 *dest++ = d; 807 length--; 808 } 809 #endif /* _KERNEL */ 810 } 811 /* 812 recover columns a and b from the given p and q into 813 bufs abuf and bbuf. All bufs are word aligned. 814 Length is in bytes. 815 */ 816 817 818 /* 819 * XXX 820 * 821 * Everything about this seems wrong. 822 */ 823 void 824 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b) 825 unsigned long *pbuf; 826 unsigned long *qbuf; 827 unsigned long *abuf; 828 unsigned long *bbuf; 829 unsigned length; 830 unsigned coeff_a; 831 unsigned coeff_b; 832 { 833 unsigned long p, q, a, a0, a1; 834 int col = (29 * coeff_a) + coeff_b; 835 unsigned char *q0 = &(rf_qinv[col][0]); 836 837 length /= 8; 838 while (length) { 839 p = *pbuf++; 840 q = *qbuf++; 841 a0 = EXTRACT(p, 0); 842 a1 = EXTRACT(q, 0); 843 a = q0[a0 << 5 | a1]; 844 #define MF(i) \ 845 a0 = EXTRACT(p,i); \ 846 a1 = EXTRACT(q,i); \ 847 a = a | INSERT(q0[a0<<5 | a1],i) 848 849 MF(1); 850 MF(2); 851 MF(3); 852 MF(4); 853 MF(5); 854 MF(6); 855 #if 0 856 MF(7); 857 MF(8); 858 MF(9); 859 MF(10); 860 MF(11); 861 MF(12); 862 #endif /* 0 */ 863 *abuf++ = a; 864 *bbuf++ = a ^ p; 865 length--; 866 } 867 } 868 /* 869 Lost parity and a data column. Recover that data column. 870 Assume col coeff is lost. Let q the contents of Q after 871 all surviving data columns have been q-xored out of it. 872 Then we have the equation 873 874 q[28-coeff][a_i ^ r_i+1] = q 875 876 but q is cyclic with period 31. 877 So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] = 878 q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} . 879 880 so a_i = r_{coeff+1} ^ q[3+coeff][q] 881 882 The routine is passed q buffer and the buffer 883 the data is to be recoverd into. They can be the same. 884 */ 885 886 887 888 static void 889 rf_InvertQ( 890 unsigned long *qbuf, 891 unsigned long *abuf, 892 unsigned length, 893 unsigned coeff) 894 { 895 unsigned long a, new; 896 unsigned long a1, a2; 897 unsigned int *q = &(rf_qfor[3 + coeff][0]); 898 unsigned r = rf_rn[coeff + 1]; 899 900 /* 13 5 bit quants in a 64 bit word */ 901 length /= 8; 902 while (length) { 903 a = *qbuf++; 904 a1 = EXTRACT(a, 0); 905 a2 = EXTRACT(a, 1); 906 a1 = r ^ q[a1]; 907 a2 = r ^ q[a2]; 908 new = INSERT(a2, 1) | a1; 909 #define M(i,j) \ 910 a1 = EXTRACT(a,i); \ 911 a2 = EXTRACT(a,j); \ 912 a1 = r ^ q[a1]; \ 913 a2 = r ^ q[a2]; \ 914 new = new | INSERT(a1,i) | INSERT(a2,j) 915 916 M(2, 3); 917 M(4, 5); 918 M(5, 6); 919 #if RF_LONGSHIFT > 2 920 M(7, 8); 921 M(9, 10); 922 M(11, 12); 923 #endif /* RF_LONGSHIFT > 2 */ 924 *abuf++ = new; 925 length--; 926 } 927 } 928 #endif /* (RF_INCLUDE_DECL_PQ > 0) || 929 * (RF_INCLUDE_RAID6 > 0) */ 930