1 /* $NetBSD: rf_pq.c,v 1.7 2000/01/07 03:41:02 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Daniel Stodolsky 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /* 30 * Code for RAID level 6 (P + Q) disk array architecture. 31 */ 32 33 #include "rf_archs.h" 34 #include "rf_types.h" 35 #include "rf_raid.h" 36 #include "rf_dag.h" 37 #include "rf_dagffrd.h" 38 #include "rf_dagffwr.h" 39 #include "rf_dagdegrd.h" 40 #include "rf_dagdegwr.h" 41 #include "rf_dagutils.h" 42 #include "rf_dagfuncs.h" 43 #include "rf_etimer.h" 44 #include "rf_pqdeg.h" 45 #include "rf_general.h" 46 #include "rf_map.h" 47 #include "rf_pq.h" 48 49 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"}; 50 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"}; 51 52 int 53 rf_RegularONPFunc(node) 54 RF_DagNode_t *node; 55 { 56 return (rf_RegularXorFunc(node)); 57 } 58 /* 59 same as simpleONQ func, but the coefficient is always 1 60 */ 61 62 int 63 rf_SimpleONPFunc(node) 64 RF_DagNode_t *node; 65 { 66 return (rf_SimpleXorFunc(node)); 67 } 68 69 int 70 rf_RecoveryPFunc(node) 71 RF_DagNode_t *node; 72 { 73 return (rf_RecoveryXorFunc(node)); 74 } 75 76 int 77 rf_RegularPFunc(node) 78 RF_DagNode_t *node; 79 { 80 return (rf_RegularXorFunc(node)); 81 } 82 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 83 84 static void 85 QDelta(char *dest, char *obuf, char *nbuf, unsigned length, 86 unsigned char coeff); 87 static void 88 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, 89 unsigned length, unsigned coeff); 90 91 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"}; 92 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"}; 93 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"}; 94 95 void 96 rf_PQDagSelect( 97 RF_Raid_t * raidPtr, 98 RF_IoType_t type, 99 RF_AccessStripeMap_t * asmap, 100 RF_VoidFuncPtr * createFunc) 101 { 102 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 103 unsigned ndfail = asmap->numDataFailed; 104 unsigned npfail = asmap->numParityFailed; 105 unsigned ntfail = npfail + ndfail; 106 107 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 108 if (ntfail > 2) { 109 RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n"); 110 /* *infoFunc = */ *createFunc = NULL; 111 return; 112 } 113 /* ok, we can do this I/O */ 114 if (type == RF_IO_TYPE_READ) { 115 switch (ndfail) { 116 case 0: 117 /* fault free read */ 118 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; /* same as raid 5 */ 119 break; 120 case 1: 121 /* lost a single data unit */ 122 /* two cases: (1) parity is not lost. do a normal raid 123 * 5 reconstruct read. (2) parity is lost. do a 124 * reconstruct read using "q". */ 125 if (ntfail == 2) { /* also lost redundancy */ 126 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) 127 *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG; 128 else 129 *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG; 130 } else { 131 /* P and Q are ok. But is there a failure in 132 * some unaccessed data unit? */ 133 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) 134 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG; 135 else 136 *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG; 137 } 138 break; 139 case 2: 140 /* lost two data units */ 141 /* *infoFunc = PQOneTwo; */ 142 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG; 143 break; 144 } 145 return; 146 } 147 /* a write */ 148 switch (ntfail) { 149 case 0: /* fault free */ 150 if (rf_suppressLocksAndLargeWrites || 151 (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || 152 (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { 153 154 *createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG; 155 } else { 156 *createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG; 157 } 158 break; 159 160 case 1: /* single disk fault */ 161 if (npfail == 1) { 162 RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)); 163 if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) { /* q died, treat like 164 * normal mode raid5 165 * write. */ 166 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) 167 || rf_NumFailedDataUnitsInStripe(raidPtr, asmap)) 168 *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG; 169 else 170 *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG; 171 } else {/* parity died, small write only updating Q */ 172 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) 173 || rf_NumFailedDataUnitsInStripe(raidPtr, asmap)) 174 *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG; 175 else 176 *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG; 177 } 178 } else { /* data missing. Do a P reconstruct write if 179 * only a single data unit is lost in the 180 * stripe, otherwise a PQ reconstruct write. */ 181 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2) 182 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG; 183 else 184 *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG; 185 } 186 break; 187 188 case 2: /* two disk faults */ 189 switch (npfail) { 190 case 2: /* both p and q dead */ 191 *createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG; 192 break; 193 case 1: /* either p or q and dead data */ 194 RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA); 195 RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)); 196 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) 197 *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG; 198 else 199 *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG; 200 break; 201 case 0: /* double data loss */ 202 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG; 203 break; 204 } 205 break; 206 207 default: /* more than 2 disk faults */ 208 *createFunc = NULL; 209 RF_PANIC(); 210 } 211 return; 212 } 213 /* 214 Used as a stop gap info function 215 */ 216 #if 0 217 static void 218 PQOne(raidPtr, nSucc, nAnte, asmap) 219 RF_Raid_t *raidPtr; 220 int *nSucc; 221 int *nAnte; 222 RF_AccessStripeMap_t *asmap; 223 { 224 *nSucc = *nAnte = 1; 225 } 226 227 static void 228 PQOneTwo(raidPtr, nSucc, nAnte, asmap) 229 RF_Raid_t *raidPtr; 230 int *nSucc; 231 int *nAnte; 232 RF_AccessStripeMap_t *asmap; 233 { 234 *nSucc = 1; 235 *nAnte = 2; 236 } 237 #endif 238 239 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG) 240 { 241 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, 242 rf_RegularPQFunc, RF_FALSE); 243 } 244 245 int 246 rf_RegularONQFunc(node) 247 RF_DagNode_t *node; 248 { 249 int np = node->numParams; 250 int d; 251 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 252 int i; 253 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 254 RF_Etimer_t timer; 255 char *qbuf, *qpbuf; 256 char *obuf, *nbuf; 257 RF_PhysDiskAddr_t *old, *new; 258 unsigned long coeff; 259 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 260 261 RF_ETIMER_START(timer); 262 263 d = (np - 3) / 4; 264 RF_ASSERT(4 * d + 3 == np); 265 qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */ 266 for (i = 0; i < d; i++) { 267 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 268 obuf = (char *) node->params[2 * i + 1].p; 269 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p; 270 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p; 271 RF_ASSERT(new->numSector == old->numSector); 272 RF_ASSERT(new->raidAddress == old->raidAddress); 273 /* the stripe unit within the stripe tells us the coefficient 274 * to use for the multiply. */ 275 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress); 276 /* compute the data unit offset within the column, then add 277 * one */ 278 coeff = (coeff % raidPtr->Layout.numDataCol); 279 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU); 280 QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 281 } 282 283 RF_ETIMER_STOP(timer); 284 RF_ETIMER_EVAL(timer); 285 tracerec->q_us += RF_ETIMER_VAL_US(timer); 286 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no 287 * I/O in this node */ 288 return (0); 289 } 290 /* 291 See the SimpleXORFunc for the difference between a simple and regular func. 292 These Q functions should be used for 293 294 new q = Q(data,old data,old q) 295 296 style updates and not for 297 298 q = ( new data, new data, .... ) 299 300 computations. 301 302 The simple q takes 2(2d+1)+1 params, where d is the number 303 of stripes written. The order of params is 304 old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d 305 [2d] old q pda_0, old q buffer 306 [2d_2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d 307 raidPtr 308 */ 309 310 int 311 rf_SimpleONQFunc(node) 312 RF_DagNode_t *node; 313 { 314 int np = node->numParams; 315 int d; 316 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 317 int i; 318 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 319 RF_Etimer_t timer; 320 char *qbuf; 321 char *obuf, *nbuf; 322 RF_PhysDiskAddr_t *old, *new; 323 unsigned long coeff; 324 325 RF_ETIMER_START(timer); 326 327 d = (np - 3) / 4; 328 RF_ASSERT(4 * d + 3 == np); 329 qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */ 330 for (i = 0; i < d; i++) { 331 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 332 obuf = (char *) node->params[2 * i + 1].p; 333 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p; 334 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p; 335 RF_ASSERT(new->numSector == old->numSector); 336 RF_ASSERT(new->raidAddress == old->raidAddress); 337 /* the stripe unit within the stripe tells us the coefficient 338 * to use for the multiply. */ 339 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress); 340 /* compute the data unit offset within the column, then add 341 * one */ 342 coeff = (coeff % raidPtr->Layout.numDataCol); 343 QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 344 } 345 346 RF_ETIMER_STOP(timer); 347 RF_ETIMER_EVAL(timer); 348 tracerec->q_us += RF_ETIMER_VAL_US(timer); 349 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no 350 * I/O in this node */ 351 return (0); 352 } 353 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG) 354 { 355 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs); 356 } 357 358 static void RegularQSubr(RF_DagNode_t *node, char *qbuf); 359 360 static void 361 RegularQSubr(node, qbuf) 362 RF_DagNode_t *node; 363 char *qbuf; 364 { 365 int np = node->numParams; 366 int d; 367 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 368 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 369 int i; 370 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 371 RF_Etimer_t timer; 372 char *obuf, *qpbuf; 373 RF_PhysDiskAddr_t *old; 374 unsigned long coeff; 375 376 RF_ETIMER_START(timer); 377 378 d = (np - 1) / 2; 379 RF_ASSERT(2 * d + 1 == np); 380 for (i = 0; i < d; i++) { 381 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 382 obuf = (char *) node->params[2 * i + 1].p; 383 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); 384 /* compute the data unit offset within the column, then add 385 * one */ 386 coeff = (coeff % raidPtr->Layout.numDataCol); 387 /* the input buffers may not all be aligned with the start of 388 * the stripe. so shift by their sector offset within the 389 * stripe unit */ 390 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU); 391 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 392 } 393 394 RF_ETIMER_STOP(timer); 395 RF_ETIMER_EVAL(timer); 396 tracerec->q_us += RF_ETIMER_VAL_US(timer); 397 } 398 /* 399 used in degraded writes. 400 */ 401 402 static void DegrQSubr(RF_DagNode_t *node); 403 404 static void 405 DegrQSubr(node) 406 RF_DagNode_t *node; 407 { 408 int np = node->numParams; 409 int d; 410 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 411 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 412 int i; 413 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 414 RF_Etimer_t timer; 415 char *qbuf = node->results[1]; 416 char *obuf, *qpbuf; 417 RF_PhysDiskAddr_t *old; 418 unsigned long coeff; 419 unsigned fail_start; 420 int j; 421 422 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p; 423 fail_start = old->startSector % secPerSU; 424 425 RF_ETIMER_START(timer); 426 427 d = (np - 2) / 2; 428 RF_ASSERT(2 * d + 2 == np); 429 for (i = 0; i < d; i++) { 430 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 431 obuf = (char *) node->params[2 * i + 1].p; 432 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); 433 /* compute the data unit offset within the column, then add 434 * one */ 435 coeff = (coeff % raidPtr->Layout.numDataCol); 436 /* the input buffers may not all be aligned with the start of 437 * the stripe. so shift by their sector offset within the 438 * stripe unit */ 439 j = old->startSector % secPerSU; 440 RF_ASSERT(j >= fail_start); 441 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start); 442 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 443 } 444 445 RF_ETIMER_STOP(timer); 446 RF_ETIMER_EVAL(timer); 447 tracerec->q_us += RF_ETIMER_VAL_US(timer); 448 } 449 /* 450 Called by large write code to compute the new parity and the new q. 451 452 structure of the params: 453 454 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol 455 raidPtr 456 457 for a total of 2d+1 arguments. 458 The result buffers results[0], results[1] are the buffers for the p and q, 459 respectively. 460 461 We compute Q first, then compute P. The P calculation may try to reuse 462 one of the input buffers for its output, so if we computed P first, we would 463 corrupt the input for the q calculation. 464 */ 465 466 int 467 rf_RegularPQFunc(node) 468 RF_DagNode_t *node; 469 { 470 RegularQSubr(node, node->results[1]); 471 return (rf_RegularXorFunc(node)); /* does the wakeup */ 472 } 473 474 int 475 rf_RegularQFunc(node) 476 RF_DagNode_t *node; 477 { 478 /* Almost ... adjust Qsubr args */ 479 RegularQSubr(node, node->results[0]); 480 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no 481 * I/O in this node */ 482 return (0); 483 } 484 /* 485 Called by singly degraded write code to compute the new parity and the new q. 486 487 structure of the params: 488 489 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d 490 failedPDA raidPtr 491 492 for a total of 2d+2 arguments. 493 The result buffers results[0], results[1] are the buffers for the parity and q, 494 respectively. 495 496 We compute Q first, then compute parity. The parity calculation may try to reuse 497 one of the input buffers for its output, so if we computed parity first, we would 498 corrupt the input for the q calculation. 499 500 We treat this identically to the regularPQ case, ignoring the failedPDA extra argument. 501 */ 502 503 void 504 rf_Degraded_100_PQFunc(node) 505 RF_DagNode_t *node; 506 { 507 int np = node->numParams; 508 509 RF_ASSERT(np >= 2); 510 DegrQSubr(node); 511 rf_RecoveryXorFunc(node); 512 } 513 514 515 /* 516 The two below are used when reading a stripe with a single lost data unit. 517 The parameters are 518 519 pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr 520 521 and results[0] contains the data buffer. Which is originally zero-filled. 522 523 */ 524 525 /* this Q func is used by the degraded-mode dag functions to recover lost data. 526 * the second-to-last parameter is the PDA for the failed portion of the access. 527 * the code here looks at this PDA and assumes that the xor target buffer is 528 * equal in size to the number of sectors in the failed PDA. It then uses 529 * the other PDAs in the parameter list to determine where within the target 530 * buffer the corresponding data should be xored. 531 * 532 * Recall the basic equation is 533 * 534 * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256 535 * 536 * so to recover data_j we need 537 * 538 * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256 539 * 540 * So the coefficient for each buffer is (255 - data_col), and j should be initialized by 541 * copying Q into it. Then we need to do a table lookup to convert to solve 542 * data_j /= J 543 * 544 * 545 */ 546 int 547 rf_RecoveryQFunc(node) 548 RF_DagNode_t *node; 549 { 550 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 551 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 552 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 553 int i; 554 RF_PhysDiskAddr_t *pda; 555 RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 556 char *srcbuf, *destbuf; 557 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 558 RF_Etimer_t timer; 559 unsigned long coeff; 560 561 RF_ETIMER_START(timer); 562 /* start by copying Q into the buffer */ 563 bcopy(node->params[node->numParams - 3].p, node->results[0], 564 rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); 565 for (i = 0; i < node->numParams - 4; i += 2) { 566 RF_ASSERT(node->params[i + 1].p != node->results[0]); 567 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 568 srcbuf = (char *) node->params[i + 1].p; 569 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 570 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 571 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress); 572 /* compute the data unit offset within the column */ 573 coeff = (coeff % raidPtr->Layout.numDataCol); 574 rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); 575 } 576 /* Do the nasty inversion now */ 577 coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol); 578 rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); 579 RF_ETIMER_STOP(timer); 580 RF_ETIMER_EVAL(timer); 581 tracerec->q_us += RF_ETIMER_VAL_US(timer); 582 rf_GenericWakeupFunc(node, 0); 583 return (0); 584 } 585 586 int 587 rf_RecoveryPQFunc(node) 588 RF_DagNode_t *node; 589 { 590 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 591 printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid); 592 return (1); 593 } 594 /* 595 Degraded write Q subroutine. 596 Used when P is dead. 597 Large-write style Q computation. 598 Parameters 599 600 (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr. 601 602 We ignore failedPDA. 603 604 This is a "simple style" recovery func. 605 */ 606 607 void 608 rf_PQ_DegradedWriteQFunc(node) 609 RF_DagNode_t *node; 610 { 611 int np = node->numParams; 612 int d; 613 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p; 614 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 615 int i; 616 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 617 RF_Etimer_t timer; 618 char *qbuf = node->results[0]; 619 char *obuf, *qpbuf; 620 RF_PhysDiskAddr_t *old; 621 unsigned long coeff; 622 int fail_start, j; 623 624 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p; 625 fail_start = old->startSector % secPerSU; 626 627 RF_ETIMER_START(timer); 628 629 d = (np - 2) / 2; 630 RF_ASSERT(2 * d + 2 == np); 631 632 for (i = 0; i < d; i++) { 633 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p; 634 obuf = (char *) node->params[2 * i + 1].p; 635 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress); 636 /* compute the data unit offset within the column, then add 637 * one */ 638 coeff = (coeff % raidPtr->Layout.numDataCol); 639 j = old->startSector % secPerSU; 640 RF_ASSERT(j >= fail_start); 641 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start); 642 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff); 643 } 644 645 RF_ETIMER_STOP(timer); 646 RF_ETIMER_EVAL(timer); 647 tracerec->q_us += RF_ETIMER_VAL_US(timer); 648 rf_GenericWakeupFunc(node, 0); 649 } 650 651 652 653 654 /* Q computations */ 655 656 /* 657 coeff - colummn; 658 659 compute dest ^= qfor[28-coeff][rn[coeff+1] a] 660 661 on 5-bit basis; 662 length in bytes; 663 */ 664 665 void 666 rf_IncQ(dest, buf, length, coeff) 667 unsigned long *dest; 668 unsigned long *buf; 669 unsigned length; 670 unsigned coeff; 671 { 672 unsigned long a, d, new; 673 unsigned long a1, a2; 674 unsigned int *q = &(rf_qfor[28 - coeff][0]); 675 unsigned r = rf_rn[coeff + 1]; 676 677 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f) 678 #define INSERT(a,i) (a << (5L*i)) 679 680 length /= 8; 681 /* 13 5 bit quants in a 64 bit word */ 682 while (length) { 683 a = *buf++; 684 d = *dest; 685 a1 = EXTRACT(a, 0) ^ r; 686 a2 = EXTRACT(a, 1) ^ r; 687 new = INSERT(a2, 1) | a1; 688 a1 = EXTRACT(a, 2) ^ r; 689 a2 = EXTRACT(a, 3) ^ r; 690 a1 = q[a1]; 691 a2 = q[a2]; 692 new = new | INSERT(a1, 2) | INSERT(a2, 3); 693 a1 = EXTRACT(a, 4) ^ r; 694 a2 = EXTRACT(a, 5) ^ r; 695 a1 = q[a1]; 696 a2 = q[a2]; 697 new = new | INSERT(a1, 4) | INSERT(a2, 5); 698 a1 = EXTRACT(a, 5) ^ r; 699 a2 = EXTRACT(a, 6) ^ r; 700 a1 = q[a1]; 701 a2 = q[a2]; 702 new = new | INSERT(a1, 5) | INSERT(a2, 6); 703 #if RF_LONGSHIFT > 2 704 a1 = EXTRACT(a, 7) ^ r; 705 a2 = EXTRACT(a, 8) ^ r; 706 a1 = q[a1]; 707 a2 = q[a2]; 708 new = new | INSERT(a1, 7) | INSERT(a2, 8); 709 a1 = EXTRACT(a, 9) ^ r; 710 a2 = EXTRACT(a, 10) ^ r; 711 a1 = q[a1]; 712 a2 = q[a2]; 713 new = new | INSERT(a1, 9) | INSERT(a2, 10); 714 a1 = EXTRACT(a, 11) ^ r; 715 a2 = EXTRACT(a, 12) ^ r; 716 a1 = q[a1]; 717 a2 = q[a2]; 718 new = new | INSERT(a1, 11) | INSERT(a2, 12); 719 #endif /* RF_LONGSHIFT > 2 */ 720 d ^= new; 721 *dest++ = d; 722 length--; 723 } 724 } 725 /* 726 compute 727 728 dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ] 729 730 on a five bit basis. 731 optimization: compute old ^ new on 64 bit basis. 732 733 length in bytes. 734 */ 735 736 static void 737 QDelta( 738 char *dest, 739 char *obuf, 740 char *nbuf, 741 unsigned length, 742 unsigned char coeff) 743 { 744 unsigned long a, d, new; 745 unsigned long a1, a2; 746 unsigned int *q = &(rf_qfor[28 - coeff][0]); 747 unsigned int r = rf_rn[coeff + 1]; 748 749 r = a1 = a2 = new = d = a = 0; /* XXX for now... */ 750 q = NULL; /* XXX for now */ 751 752 #ifdef _KERNEL 753 /* PQ in kernel currently not supported because the encoding/decoding 754 * table is not present */ 755 bzero(dest, length); 756 #else /* KERNEL */ 757 /* this code probably doesn't work and should be rewritten -wvcii */ 758 /* 13 5 bit quants in a 64 bit word */ 759 length /= 8; 760 while (length) { 761 a = *obuf++; /* XXX need to reorg to avoid cache conflicts */ 762 a ^= *nbuf++; 763 d = *dest; 764 a1 = EXTRACT(a, 0) ^ r; 765 a2 = EXTRACT(a, 1) ^ r; 766 a1 = q[a1]; 767 a2 = q[a2]; 768 new = INSERT(a2, 1) | a1; 769 a1 = EXTRACT(a, 2) ^ r; 770 a2 = EXTRACT(a, 3) ^ r; 771 a1 = q[a1]; 772 a2 = q[a2]; 773 new = new | INSERT(a1, 2) | INSERT(a2, 3); 774 a1 = EXTRACT(a, 4) ^ r; 775 a2 = EXTRACT(a, 5) ^ r; 776 a1 = q[a1]; 777 a2 = q[a2]; 778 new = new | INSERT(a1, 4) | INSERT(a2, 5); 779 a1 = EXTRACT(a, 5) ^ r; 780 a2 = EXTRACT(a, 6) ^ r; 781 a1 = q[a1]; 782 a2 = q[a2]; 783 new = new | INSERT(a1, 5) | INSERT(a2, 6); 784 #if RF_LONGSHIFT > 2 785 a1 = EXTRACT(a, 7) ^ r; 786 a2 = EXTRACT(a, 8) ^ r; 787 a1 = q[a1]; 788 a2 = q[a2]; 789 new = new | INSERT(a1, 7) | INSERT(a2, 8); 790 a1 = EXTRACT(a, 9) ^ r; 791 a2 = EXTRACT(a, 10) ^ r; 792 a1 = q[a1]; 793 a2 = q[a2]; 794 new = new | INSERT(a1, 9) | INSERT(a2, 10); 795 a1 = EXTRACT(a, 11) ^ r; 796 a2 = EXTRACT(a, 12) ^ r; 797 a1 = q[a1]; 798 a2 = q[a2]; 799 new = new | INSERT(a1, 11) | INSERT(a2, 12); 800 #endif /* RF_LONGSHIFT > 2 */ 801 d ^= new; 802 *dest++ = d; 803 length--; 804 } 805 #endif /* _KERNEL */ 806 } 807 /* 808 recover columns a and b from the given p and q into 809 bufs abuf and bbuf. All bufs are word aligned. 810 Length is in bytes. 811 */ 812 813 814 /* 815 * XXX 816 * 817 * Everything about this seems wrong. 818 */ 819 void 820 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b) 821 unsigned long *pbuf; 822 unsigned long *qbuf; 823 unsigned long *abuf; 824 unsigned long *bbuf; 825 unsigned length; 826 unsigned coeff_a; 827 unsigned coeff_b; 828 { 829 unsigned long p, q, a, a0, a1; 830 int col = (29 * coeff_a) + coeff_b; 831 unsigned char *q0 = &(rf_qinv[col][0]); 832 833 length /= 8; 834 while (length) { 835 p = *pbuf++; 836 q = *qbuf++; 837 a0 = EXTRACT(p, 0); 838 a1 = EXTRACT(q, 0); 839 a = q0[a0 << 5 | a1]; 840 #define MF(i) \ 841 a0 = EXTRACT(p,i); \ 842 a1 = EXTRACT(q,i); \ 843 a = a | INSERT(q0[a0<<5 | a1],i) 844 845 MF(1); 846 MF(2); 847 MF(3); 848 MF(4); 849 MF(5); 850 MF(6); 851 #if 0 852 MF(7); 853 MF(8); 854 MF(9); 855 MF(10); 856 MF(11); 857 MF(12); 858 #endif /* 0 */ 859 *abuf++ = a; 860 *bbuf++ = a ^ p; 861 length--; 862 } 863 } 864 /* 865 Lost parity and a data column. Recover that data column. 866 Assume col coeff is lost. Let q the contents of Q after 867 all surviving data columns have been q-xored out of it. 868 Then we have the equation 869 870 q[28-coeff][a_i ^ r_i+1] = q 871 872 but q is cyclic with period 31. 873 So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] = 874 q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} . 875 876 so a_i = r_{coeff+1} ^ q[3+coeff][q] 877 878 The routine is passed q buffer and the buffer 879 the data is to be recoverd into. They can be the same. 880 */ 881 882 883 884 static void 885 rf_InvertQ( 886 unsigned long *qbuf, 887 unsigned long *abuf, 888 unsigned length, 889 unsigned coeff) 890 { 891 unsigned long a, new; 892 unsigned long a1, a2; 893 unsigned int *q = &(rf_qfor[3 + coeff][0]); 894 unsigned r = rf_rn[coeff + 1]; 895 896 /* 13 5 bit quants in a 64 bit word */ 897 length /= 8; 898 while (length) { 899 a = *qbuf++; 900 a1 = EXTRACT(a, 0); 901 a2 = EXTRACT(a, 1); 902 a1 = r ^ q[a1]; 903 a2 = r ^ q[a2]; 904 new = INSERT(a2, 1) | a1; 905 #define M(i,j) \ 906 a1 = EXTRACT(a,i); \ 907 a2 = EXTRACT(a,j); \ 908 a1 = r ^ q[a1]; \ 909 a2 = r ^ q[a2]; \ 910 new = new | INSERT(a1,i) | INSERT(a2,j) 911 912 M(2, 3); 913 M(4, 5); 914 M(5, 6); 915 #if RF_LONGSHIFT > 2 916 M(7, 8); 917 M(9, 10); 918 M(11, 12); 919 #endif /* RF_LONGSHIFT > 2 */ 920 *abuf++ = new; 921 length--; 922 } 923 } 924 #endif /* (RF_INCLUDE_DECL_PQ > 0) || 925 * (RF_INCLUDE_RAID6 > 0) */ 926