1 /* $NetBSD: rf_pq.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Daniel Stodolsky 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /* 30 * Code for RAID level 6 (P + Q) disk array architecture. 31 * 32 * : 33 * Log: rf_pq.c,v 34 * Revision 1.33 1996/11/05 21:10:40 jimz 35 * failed pda generalization 36 * 37 * Revision 1.32 1996/07/31 16:29:50 jimz 38 * "fix" math on 32-bit machines using RF_LONGSHIFT 39 * (may be incorrect) 40 * 41 * Revision 1.31 1996/07/31 15:35:01 jimz 42 * evenodd changes; bugfixes for double-degraded archs, generalize 43 * some formerly PQ-only functions 44 * 45 * Revision 1.30 1996/07/27 23:36:08 jimz 46 * Solaris port of simulator 47 * 48 * Revision 1.29 1996/07/22 19:52:16 jimz 49 * switched node params to RF_DagParam_t, a union of 50 * a 64-bit int and a void *, for better portability 51 * attempted hpux port, but failed partway through for 52 * lack of a single C compiler capable of compiling all 53 * source files 54 * 55 * Revision 1.28 1996/06/09 02:36:46 jimz 56 * lots of little crufty cleanup- fixup whitespace 57 * issues, comment #ifdefs, improve typing in some 58 * places (esp size-related) 59 * 60 * Revision 1.27 1996/06/07 21:33:04 jimz 61 * begin using consistent types for sector numbers, 62 * stripe numbers, row+col numbers, recon unit numbers 63 * 64 * Revision 1.26 1996/06/02 17:31:48 jimz 65 * Moved a lot of global stuff into array structure, where it belongs. 66 * Fixed up paritylogging, pss modules in this manner. Some general 67 * code cleanup. Removed lots of dead code, some dead files. 68 * 69 * Revision 1.25 1996/05/31 22:26:54 jimz 70 * fix a lot of mapping problems, memory allocation problems 71 * found some weird lock issues, fixed 'em 72 * more code cleanup 73 * 74 * Revision 1.24 1996/05/30 23:22:16 jimz 75 * bugfixes of serialization, timing problems 76 * more cleanup 77 * 78 * Revision 1.23 1996/05/30 12:59:18 jimz 79 * make etimer happier, more portable 80 * 81 * Revision 1.22 1996/05/27 18:56:37 jimz 82 * more code cleanup 83 * better typing 84 * compiles in all 3 environments 85 * 86 * Revision 1.21 1996/05/24 22:17:04 jimz 87 * continue code + namespace cleanup 88 * typed a bunch of flags 89 * 90 * Revision 1.20 1996/05/24 04:28:55 jimz 91 * release cleanup ckpt 92 * 93 * Revision 1.19 1996/05/23 21:46:35 jimz 94 * checkpoint in code cleanup (release prep) 95 * lots of types, function names have been fixed 96 * 97 * Revision 1.18 1996/05/23 00:33:23 jimz 98 * code cleanup: move all debug decls to rf_options.c, all extern 99 * debug decls to rf_options.h, all debug vars preceded by rf_ 100 * 101 * Revision 1.17 1996/05/18 19:51:34 jimz 102 * major code cleanup- fix syntax, make some types consistent, 103 * add prototypes, clean out dead code, et cetera 104 * 105 * Revision 1.16 1996/05/17 14:52:04 wvcii 106 * added prototyping to QDelta() 107 * - changed buf params from volatile unsigned long * to char * 108 * changed QDelta for kernel 109 * - just bzero the buf since kernel doesn't include pq decode table 110 * 111 * Revision 1.15 1996/05/03 19:40:20 wvcii 112 * added includes for dag library 113 * 114 * Revision 1.14 1995/12/12 18:10:06 jimz 115 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT 116 * fix 80-column brain damage in comments 117 * 118 * Revision 1.13 1995/11/30 16:19:55 wvcii 119 * added copyright info 120 * 121 * Revision 1.12 1995/11/07 16:13:47 wvcii 122 * changed PQDagSelect prototype 123 * function no longer returns numHdrSucc, numTermAnt 124 * note: this file contains node functions which should be 125 * moved to rf_dagfuncs.c so that all node funcs are bundled together 126 * 127 * Revision 1.11 1995/10/04 03:50:33 wvcii 128 * removed panics, minor code cleanup in dag selection 129 * 130 * 131 */ 132 133 #include "rf_archs.h" 134 #include "rf_types.h" 135 #include "rf_raid.h" 136 #include "rf_dag.h" 137 #include "rf_dagffrd.h" 138 #include "rf_dagffwr.h" 139 #include "rf_dagdegrd.h" 140 #include "rf_dagdegwr.h" 141 #include "rf_dagutils.h" 142 #include "rf_dagfuncs.h" 143 #include "rf_threadid.h" 144 #include "rf_etimer.h" 145 #include "rf_pqdeg.h" 146 #include "rf_general.h" 147 #include "rf_map.h" 148 #include "rf_pq.h" 149 #include "rf_sys.h" 150 151 RF_RedFuncs_t rf_pFuncs = { rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P" }; 152 RF_RedFuncs_t rf_pRecoveryFuncs = { rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func" }; 153 154 int rf_RegularONPFunc(node) 155 RF_DagNode_t *node; 156 { 157 return(rf_RegularXorFunc(node)); 158 } 159 160 /* 161 same as simpleONQ func, but the coefficient is always 1 162 */ 163 164 int rf_SimpleONPFunc(node) 165 RF_DagNode_t *node; 166 { 167 return(rf_SimpleXorFunc(node)); 168 } 169 170 int rf_RecoveryPFunc(node) 171 RF_DagNode_t *node; 172 { 173 return(rf_RecoveryXorFunc(node)); 174 } 175 176 int rf_RegularPFunc(node) 177 RF_DagNode_t *node; 178 { 179 return(rf_RegularXorFunc(node)); 180 } 181 182 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 183 184 static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length, 185 unsigned char coeff); 186 static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, 187 unsigned length, unsigned coeff); 188 189 RF_RedFuncs_t rf_qFuncs = { rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q" }; 190 RF_RedFuncs_t rf_qRecoveryFuncs = { rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func" }; 191 RF_RedFuncs_t rf_pqRecoveryFuncs = { rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func" }; 192 193 void rf_PQDagSelect( 194 RF_Raid_t *raidPtr, 195 RF_IoType_t type, 196 RF_AccessStripeMap_t *asmap, 197 RF_VoidFuncPtr *createFunc) 198 { 199 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 200 unsigned ndfail = asmap->numDataFailed; 201 unsigned npfail = asmap->numParityFailed; 202 unsigned ntfail = npfail + ndfail; 203 204 RF_ASSERT(RF_IO_IS_R_OR_W(type)); 205 if (ntfail > 2) 206 { 207 RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n"); 208 /* *infoFunc = */ *createFunc = NULL; 209 return; 210 } 211 212 /* ok, we can do this I/O */ 213 if (type == RF_IO_TYPE_READ) 214 { 215 switch (ndfail) 216 { 217 case 0: 218 /* fault free read */ 219 *createFunc = rf_CreateFaultFreeReadDAG; /* same as raid 5 */ 220 break; 221 case 1: 222 /* lost a single data unit */ 223 /* two cases: 224 (1) parity is not lost. 225 do a normal raid 5 reconstruct read. 226 (2) parity is lost. 227 do a reconstruct read using "q". 228 */ 229 if (ntfail == 2) /* also lost redundancy */ 230 { 231 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) 232 *createFunc = rf_PQ_110_CreateReadDAG; 233 else 234 *createFunc = rf_PQ_101_CreateReadDAG; 235 } 236 else 237 { 238 /* P and Q are ok. But is there a failure 239 in some unaccessed data unit? 240 */ 241 if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2) 242 *createFunc = rf_PQ_200_CreateReadDAG; 243 else 244 *createFunc = rf_PQ_100_CreateReadDAG; 245 } 246 break; 247 case 2: 248 /* lost two data units */ 249 /* *infoFunc = PQOneTwo; */ 250 *createFunc = rf_PQ_200_CreateReadDAG; 251 break; 252 } 253 return; 254 } 255 256 /* a write */ 257 switch (ntfail) 258 { 259 case 0: /* fault free */ 260 if (rf_suppressLocksAndLargeWrites || 261 (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || 262 (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { 263 264 *createFunc = rf_PQCreateSmallWriteDAG; 265 } 266 else { 267 *createFunc = rf_PQCreateLargeWriteDAG; 268 } 269 break; 270 271 case 1: /* single disk fault */ 272 if (npfail==1) 273 { 274 RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)); 275 if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) 276 { /* q died, treat like normal mode raid5 write.*/ 277 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) 278 || rf_NumFailedDataUnitsInStripe(raidPtr,asmap)) 279 *createFunc = rf_PQ_001_CreateSmallWriteDAG; 280 else 281 *createFunc = rf_PQ_001_CreateLargeWriteDAG; 282 } 283 else 284 { /* parity died, small write only updating Q */ 285 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) 286 || rf_NumFailedDataUnitsInStripe(raidPtr,asmap)) 287 *createFunc = rf_PQ_010_CreateSmallWriteDAG; 288 else 289 *createFunc = rf_PQ_010_CreateLargeWriteDAG; 290 } 291 } 292 else 293 { /* data missing. 294 Do a P reconstruct write if only a single data unit 295 is lost in the stripe, otherwise a PQ reconstruct 296 write. */ 297 if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2) 298 *createFunc = rf_PQ_200_CreateWriteDAG; 299 else 300 *createFunc = rf_PQ_100_CreateWriteDAG; 301 } 302 break; 303 304 case 2: /* two disk faults */ 305 switch (npfail) 306 { 307 case 2: /* both p and q dead */ 308 *createFunc = rf_PQ_011_CreateWriteDAG; 309 break; 310 case 1: /* either p or q and dead data */ 311 RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA); 312 RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)); 313 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) 314 *createFunc = rf_PQ_101_CreateWriteDAG; 315 else 316 *createFunc = rf_PQ_110_CreateWriteDAG; 317 break; 318 case 0: /* double data loss */ 319 *createFunc = rf_PQ_200_CreateWriteDAG; 320 break; 321 } 322 break; 323 324 default: /* more than 2 disk faults */ 325 *createFunc = NULL; 326 RF_PANIC(); 327 } 328 return; 329 } 330 331 /* 332 Used as a stop gap info function 333 */ 334 static void PQOne(raidPtr, nSucc, nAnte, asmap) 335 RF_Raid_t *raidPtr; 336 int *nSucc; 337 int *nAnte; 338 RF_AccessStripeMap_t *asmap; 339 { 340 *nSucc = *nAnte = 1; 341 } 342 343 static void PQOneTwo(raidPtr, nSucc, nAnte, asmap) 344 RF_Raid_t *raidPtr; 345 int *nSucc; 346 int *nAnte; 347 RF_AccessStripeMap_t *asmap; 348 { 349 *nSucc = 1; 350 *nAnte = 2; 351 } 352 353 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG) 354 { 355 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, 356 rf_RegularPQFunc, RF_FALSE); 357 } 358 359 int rf_RegularONQFunc(node) 360 RF_DagNode_t *node; 361 { 362 int np = node->numParams; 363 int d; 364 RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[np-1].p; 365 int i; 366 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 367 RF_Etimer_t timer; 368 char *qbuf, *qpbuf; 369 char *obuf, *nbuf; 370 RF_PhysDiskAddr_t *old, *new; 371 unsigned long coeff; 372 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 373 374 RF_ETIMER_START(timer); 375 376 d = (np-3)/4; 377 RF_ASSERT (4*d+3 == np); 378 qbuf = (char *) node->params[2*d+1].p; /* q buffer*/ 379 for (i=0; i < d; i++) 380 { 381 old = (RF_PhysDiskAddr_t *) node->params[2*i].p; 382 obuf = (char *) node->params[2*i+1].p; 383 new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p; 384 nbuf = (char *) node->params[2*(d+1+i)+1].p; 385 RF_ASSERT (new->numSector == old->numSector); 386 RF_ASSERT (new->raidAddress == old->raidAddress); 387 /* the stripe unit within the stripe tells us the coefficient to use 388 for the multiply. */ 389 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress); 390 /* compute the data unit offset within the column, then add one */ 391 coeff = (coeff % raidPtr->Layout.numDataCol); 392 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU); 393 QDelta(qpbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff); 394 } 395 396 RF_ETIMER_STOP(timer); 397 RF_ETIMER_EVAL(timer); 398 tracerec->q_us += RF_ETIMER_VAL_US(timer); 399 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */ 400 return(0); 401 } 402 403 /* 404 See the SimpleXORFunc for the difference between a simple and regular func. 405 These Q functions should be used for 406 407 new q = Q(data,old data,old q) 408 409 style updates and not for 410 411 q = ( new data, new data, .... ) 412 413 computations. 414 415 The simple q takes 2(2d+1)+1 params, where d is the number 416 of stripes written. The order of params is 417 old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d 418 [2d] old q pda_0, old q buffer 419 [2d_2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d 420 raidPtr 421 */ 422 423 int rf_SimpleONQFunc(node) 424 RF_DagNode_t *node; 425 { 426 int np = node->numParams; 427 int d; 428 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; 429 int i; 430 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 431 RF_Etimer_t timer; 432 char *qbuf; 433 char *obuf, *nbuf; 434 RF_PhysDiskAddr_t *old, *new; 435 unsigned long coeff; 436 437 RF_ETIMER_START(timer); 438 439 d = (np-3)/4; 440 RF_ASSERT (4*d+3 == np); 441 qbuf = (char *) node->params[2*d+1].p; /* q buffer*/ 442 for (i=0; i < d; i++) 443 { 444 old = (RF_PhysDiskAddr_t *) node->params[2*i].p; 445 obuf = (char *) node->params[2*i+1].p; 446 new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p; 447 nbuf = (char *) node->params[2*(d+1+i)+1].p; 448 RF_ASSERT (new->numSector == old->numSector); 449 RF_ASSERT (new->raidAddress == old->raidAddress); 450 /* the stripe unit within the stripe tells us the coefficient to use 451 for the multiply. */ 452 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress); 453 /* compute the data unit offset within the column, then add one */ 454 coeff = (coeff % raidPtr->Layout.numDataCol); 455 QDelta(qbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff); 456 } 457 458 RF_ETIMER_STOP(timer); 459 RF_ETIMER_EVAL(timer); 460 tracerec->q_us += RF_ETIMER_VAL_US(timer); 461 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */ 462 return(0); 463 } 464 465 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG) 466 { 467 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs); 468 } 469 470 static void RegularQSubr(node,qbuf) 471 RF_DagNode_t *node; 472 char *qbuf; 473 { 474 int np = node->numParams; 475 int d; 476 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; 477 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 478 int i; 479 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 480 RF_Etimer_t timer; 481 char *obuf, *qpbuf; 482 RF_PhysDiskAddr_t *old; 483 unsigned long coeff; 484 485 RF_ETIMER_START(timer); 486 487 d = (np-1)/2; 488 RF_ASSERT (2*d+1 == np); 489 for (i=0; i < d; i++) 490 { 491 old = (RF_PhysDiskAddr_t *) node->params[2*i].p; 492 obuf = (char *) node->params[2*i+1].p; 493 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress); 494 /* compute the data unit offset within the column, then add one */ 495 coeff = (coeff % raidPtr->Layout.numDataCol); 496 /* the input buffers may not all be aligned with the start of the 497 stripe. so shift by their sector offset within the stripe unit */ 498 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU); 499 rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff); 500 } 501 502 RF_ETIMER_STOP(timer); 503 RF_ETIMER_EVAL(timer); 504 tracerec->q_us += RF_ETIMER_VAL_US(timer); 505 } 506 507 /* 508 used in degraded writes. 509 */ 510 511 static void DegrQSubr(node) 512 RF_DagNode_t *node; 513 { 514 int np = node->numParams; 515 int d; 516 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; 517 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 518 int i; 519 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 520 RF_Etimer_t timer; 521 char *qbuf = node->results[1]; 522 char *obuf, *qpbuf; 523 RF_PhysDiskAddr_t *old; 524 unsigned long coeff; 525 unsigned fail_start; 526 int j; 527 528 old = (RF_PhysDiskAddr_t *)node->params[np-2].p; 529 fail_start = old->startSector % secPerSU; 530 531 RF_ETIMER_START(timer); 532 533 d = (np-2)/2; 534 RF_ASSERT (2*d+2 == np); 535 for (i=0; i < d; i++) 536 { 537 old = (RF_PhysDiskAddr_t *) node->params[2*i].p; 538 obuf = (char *) node->params[2*i+1].p; 539 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress); 540 /* compute the data unit offset within the column, then add one */ 541 coeff = (coeff % raidPtr->Layout.numDataCol); 542 /* the input buffers may not all be aligned with the start of the 543 stripe. so shift by their sector offset within the stripe unit */ 544 j = old->startSector % secPerSU; 545 RF_ASSERT(j >= fail_start); 546 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start); 547 rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff); 548 } 549 550 RF_ETIMER_STOP(timer); 551 RF_ETIMER_EVAL(timer); 552 tracerec->q_us += RF_ETIMER_VAL_US(timer); 553 } 554 555 /* 556 Called by large write code to compute the new parity and the new q. 557 558 structure of the params: 559 560 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol 561 raidPtr 562 563 for a total of 2d+1 arguments. 564 The result buffers results[0], results[1] are the buffers for the p and q, 565 respectively. 566 567 We compute Q first, then compute P. The P calculation may try to reuse 568 one of the input buffers for its output, so if we computed P first, we would 569 corrupt the input for the q calculation. 570 */ 571 572 int rf_RegularPQFunc(node) 573 RF_DagNode_t *node; 574 { 575 RegularQSubr(node,node->results[1]); 576 return(rf_RegularXorFunc(node)); /* does the wakeup */ 577 } 578 579 int rf_RegularQFunc(node) 580 RF_DagNode_t *node; 581 { 582 /* Almost ... adjust Qsubr args */ 583 RegularQSubr(node, node->results[0]); 584 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */ 585 return(0); 586 } 587 588 /* 589 Called by singly degraded write code to compute the new parity and the new q. 590 591 structure of the params: 592 593 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d 594 failedPDA raidPtr 595 596 for a total of 2d+2 arguments. 597 The result buffers results[0], results[1] are the buffers for the parity and q, 598 respectively. 599 600 We compute Q first, then compute parity. The parity calculation may try to reuse 601 one of the input buffers for its output, so if we computed parity first, we would 602 corrupt the input for the q calculation. 603 604 We treat this identically to the regularPQ case, ignoring the failedPDA extra argument. 605 */ 606 607 void rf_Degraded_100_PQFunc(node) 608 RF_DagNode_t *node; 609 { 610 int np = node->numParams; 611 612 RF_ASSERT (np >= 2); 613 DegrQSubr(node); 614 rf_RecoveryXorFunc(node); 615 } 616 617 618 /* 619 The two below are used when reading a stripe with a single lost data unit. 620 The parameters are 621 622 pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr 623 624 and results[0] contains the data buffer. Which is originally zero-filled. 625 626 */ 627 628 /* this Q func is used by the degraded-mode dag functions to recover lost data. 629 * the second-to-last parameter is the PDA for the failed portion of the access. 630 * the code here looks at this PDA and assumes that the xor target buffer is 631 * equal in size to the number of sectors in the failed PDA. It then uses 632 * the other PDAs in the parameter list to determine where within the target 633 * buffer the corresponding data should be xored. 634 * 635 * Recall the basic equation is 636 * 637 * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256 638 * 639 * so to recover data_j we need 640 * 641 * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256 642 * 643 * So the coefficient for each buffer is (255 - data_col), and j should be initialized by 644 * copying Q into it. Then we need to do a table lookup to convert to solve 645 * data_j /= J 646 * 647 * 648 */ 649 int rf_RecoveryQFunc(node) 650 RF_DagNode_t *node; 651 { 652 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; 653 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; 654 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p; 655 int i; 656 RF_PhysDiskAddr_t *pda; 657 RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector); 658 char *srcbuf, *destbuf; 659 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 660 RF_Etimer_t timer; 661 unsigned long coeff; 662 663 RF_ETIMER_START(timer); 664 /* start by copying Q into the buffer */ 665 bcopy(node->params[node->numParams-3].p,node->results[0], 666 rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); 667 for (i=0; i<node->numParams-4; i+=2) 668 { 669 RF_ASSERT (node->params[i+1].p != node->results[0]); 670 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 671 srcbuf = (char *) node->params[i+1].p; 672 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 673 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset); 674 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress); 675 /* compute the data unit offset within the column */ 676 coeff = (coeff % raidPtr->Layout.numDataCol); 677 rf_IncQ((unsigned long *)destbuf, (unsigned long *)srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); 678 } 679 /* Do the nasty inversion now */ 680 coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),failedPDA->startSector) % raidPtr->Layout.numDataCol); 681 rf_InvertQ(node->results[0],node->results[0],rf_RaidAddressToByte(raidPtr,pda->numSector),coeff); 682 RF_ETIMER_STOP(timer); 683 RF_ETIMER_EVAL(timer); 684 tracerec->q_us += RF_ETIMER_VAL_US(timer); 685 rf_GenericWakeupFunc(node, 0); 686 return(0); 687 } 688 689 int rf_RecoveryPQFunc(node) 690 RF_DagNode_t *node; 691 { 692 RF_PANIC(); 693 return(1); 694 } 695 696 /* 697 Degraded write Q subroutine. 698 Used when P is dead. 699 Large-write style Q computation. 700 Parameters 701 702 (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr. 703 704 We ignore failedPDA. 705 706 This is a "simple style" recovery func. 707 */ 708 709 void rf_PQ_DegradedWriteQFunc(node) 710 RF_DagNode_t *node; 711 { 712 int np = node->numParams; 713 int d; 714 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; 715 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; 716 int i; 717 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 718 RF_Etimer_t timer; 719 char *qbuf = node->results[0]; 720 char *obuf, *qpbuf; 721 RF_PhysDiskAddr_t *old; 722 unsigned long coeff; 723 int fail_start,j; 724 725 old = (RF_PhysDiskAddr_t *) node->params[np-2].p; 726 fail_start = old->startSector % secPerSU; 727 728 RF_ETIMER_START(timer); 729 730 d = (np-2)/2; 731 RF_ASSERT (2*d+2 == np); 732 733 for (i=0; i < d; i++) 734 { 735 old = (RF_PhysDiskAddr_t *) node->params[2*i].p; 736 obuf = (char *) node->params[2*i+1].p; 737 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress); 738 /* compute the data unit offset within the column, then add one */ 739 coeff = (coeff % raidPtr->Layout.numDataCol); 740 j = old->startSector % secPerSU; 741 RF_ASSERT(j >= fail_start); 742 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start); 743 rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff); 744 } 745 746 RF_ETIMER_STOP(timer); 747 RF_ETIMER_EVAL(timer); 748 tracerec->q_us += RF_ETIMER_VAL_US(timer); 749 rf_GenericWakeupFunc(node, 0); 750 } 751 752 753 754 755 /* Q computations */ 756 757 /* 758 coeff - colummn; 759 760 compute dest ^= qfor[28-coeff][rn[coeff+1] a] 761 762 on 5-bit basis; 763 length in bytes; 764 */ 765 766 void rf_IncQ(dest,buf,length,coeff) 767 unsigned long *dest; 768 unsigned long *buf; 769 unsigned length; 770 unsigned coeff; 771 { 772 unsigned long a, d, new; 773 unsigned long a1, a2; 774 unsigned int *q = &(rf_qfor[28-coeff][0]); 775 unsigned r = rf_rn[coeff+1]; 776 777 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f) 778 #define INSERT(a,i) (a << (5L*i)) 779 780 length /= 8; 781 /* 13 5 bit quants in a 64 bit word */ 782 while (length) 783 { 784 a = *buf++; 785 d = *dest; 786 a1 = EXTRACT(a,0) ^ r; 787 a2 = EXTRACT(a,1) ^ r; 788 new = INSERT(a2,1) | a1 ; 789 a1 = EXTRACT(a,2) ^ r; 790 a2 = EXTRACT(a,3) ^ r; 791 a1 = q[a1]; 792 a2 = q[a2]; 793 new = new | INSERT(a1,2) | INSERT (a2,3); 794 a1 = EXTRACT(a,4) ^ r; 795 a2 = EXTRACT(a,5) ^ r; 796 a1 = q[a1]; 797 a2 = q[a2]; 798 new = new | INSERT(a1,4) | INSERT (a2,5); 799 a1 = EXTRACT(a,5) ^ r; 800 a2 = EXTRACT(a,6) ^ r; 801 a1 = q[a1]; 802 a2 = q[a2]; 803 new = new | INSERT(a1,5) | INSERT (a2,6); 804 #if RF_LONGSHIFT > 2 805 a1 = EXTRACT(a,7) ^ r; 806 a2 = EXTRACT(a,8) ^ r; 807 a1 = q[a1]; 808 a2 = q[a2]; 809 new = new | INSERT(a1,7) | INSERT (a2,8); 810 a1 = EXTRACT(a,9) ^ r; 811 a2 = EXTRACT(a,10) ^ r; 812 a1 = q[a1]; 813 a2 = q[a2]; 814 new = new | INSERT(a1,9) | INSERT (a2,10); 815 a1 = EXTRACT(a,11) ^ r; 816 a2 = EXTRACT(a,12) ^ r; 817 a1 = q[a1]; 818 a2 = q[a2]; 819 new = new | INSERT(a1,11) | INSERT (a2,12); 820 #endif /* RF_LONGSHIFT > 2 */ 821 d ^= new; 822 *dest++ = d; 823 length--; 824 } 825 } 826 827 /* 828 compute 829 830 dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ] 831 832 on a five bit basis. 833 optimization: compute old ^ new on 64 bit basis. 834 835 length in bytes. 836 */ 837 838 static void QDelta( 839 char *dest, 840 char *obuf, 841 char *nbuf, 842 unsigned length, 843 unsigned char coeff) 844 { 845 unsigned long a, d, new; 846 unsigned long a1, a2; 847 unsigned int *q = &(rf_qfor[28-coeff][0]); 848 unsigned r = rf_rn[coeff+1]; 849 850 #ifdef KERNEL 851 /* PQ in kernel currently not supported because the encoding/decoding table is not present */ 852 bzero(dest, length); 853 #else /* KERNEL */ 854 /* this code probably doesn't work and should be rewritten -wvcii */ 855 /* 13 5 bit quants in a 64 bit word */ 856 length /= 8; 857 while (length) 858 { 859 a = *obuf++; /* XXX need to reorg to avoid cache conflicts */ 860 a ^= *nbuf++; 861 d = *dest; 862 a1 = EXTRACT(a,0) ^ r; 863 a2 = EXTRACT(a,1) ^ r; 864 a1 = q[a1]; 865 a2 = q[a2]; 866 new = INSERT(a2,1) | a1 ; 867 a1 = EXTRACT(a,2) ^ r; 868 a2 = EXTRACT(a,3) ^ r; 869 a1 = q[a1]; 870 a2 = q[a2]; 871 new = new | INSERT(a1,2) | INSERT (a2,3); 872 a1 = EXTRACT(a,4) ^ r; 873 a2 = EXTRACT(a,5) ^ r; 874 a1 = q[a1]; 875 a2 = q[a2]; 876 new = new | INSERT(a1,4) | INSERT (a2,5); 877 a1 = EXTRACT(a,5) ^ r; 878 a2 = EXTRACT(a,6) ^ r; 879 a1 = q[a1]; 880 a2 = q[a2]; 881 new = new | INSERT(a1,5) | INSERT (a2,6); 882 #if RF_LONGSHIFT > 2 883 a1 = EXTRACT(a,7) ^ r; 884 a2 = EXTRACT(a,8) ^ r; 885 a1 = q[a1]; 886 a2 = q[a2]; 887 new = new | INSERT(a1,7) | INSERT (a2,8); 888 a1 = EXTRACT(a,9) ^ r; 889 a2 = EXTRACT(a,10) ^ r; 890 a1 = q[a1]; 891 a2 = q[a2]; 892 new = new | INSERT(a1,9) | INSERT (a2,10); 893 a1 = EXTRACT(a,11) ^ r; 894 a2 = EXTRACT(a,12) ^ r; 895 a1 = q[a1]; 896 a2 = q[a2]; 897 new = new | INSERT(a1,11) | INSERT (a2,12); 898 #endif /* RF_LONGSHIFT > 2 */ 899 d ^= new; 900 *dest++ = d; 901 length--; 902 } 903 #endif /* KERNEL */ 904 } 905 906 /* 907 recover columns a and b from the given p and q into 908 bufs abuf and bbuf. All bufs are word aligned. 909 Length is in bytes. 910 */ 911 912 913 /* 914 * XXX 915 * 916 * Everything about this seems wrong. 917 */ 918 void rf_PQ_recover(pbuf,qbuf,abuf,bbuf,length,coeff_a,coeff_b) 919 unsigned long *pbuf; 920 unsigned long *qbuf; 921 unsigned long *abuf; 922 unsigned long *bbuf; 923 unsigned length; 924 unsigned coeff_a; 925 unsigned coeff_b; 926 { 927 unsigned long p, q, a, a0, a1; 928 int col = (29 * coeff_a) + coeff_b; 929 unsigned char *q0 = & (rf_qinv[col][0]); 930 931 length /= 8; 932 while (length) 933 { 934 p = *pbuf++; 935 q = *qbuf++; 936 a0 = EXTRACT(p,0); 937 a1 = EXTRACT(q,0); 938 a = q0[a0<<5 | a1]; 939 #define MF(i) \ 940 a0 = EXTRACT(p,i); \ 941 a1 = EXTRACT(q,i); \ 942 a = a | INSERT(q0[a0<<5 | a1],i) 943 944 MF(1); 945 MF(2); 946 MF(3); 947 MF(4); 948 MF(5); 949 MF(6); 950 #if 0 951 MF(7); 952 MF(8); 953 MF(9); 954 MF(10); 955 MF(11); 956 MF(12); 957 #endif /* 0 */ 958 *abuf++ = a; 959 *bbuf++ = a ^ p; 960 length--; 961 } 962 } 963 964 /* 965 Lost parity and a data column. Recover that data column. 966 Assume col coeff is lost. Let q the contents of Q after 967 all surviving data columns have been q-xored out of it. 968 Then we have the equation 969 970 q[28-coeff][a_i ^ r_i+1] = q 971 972 but q is cyclic with period 31. 973 So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] = 974 q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} . 975 976 so a_i = r_{coeff+1} ^ q[3+coeff][q] 977 978 The routine is passed q buffer and the buffer 979 the data is to be recoverd into. They can be the same. 980 */ 981 982 983 984 static void rf_InvertQ( 985 unsigned long *qbuf, 986 unsigned long *abuf, 987 unsigned length, 988 unsigned coeff) 989 { 990 unsigned long a, new; 991 unsigned long a1, a2; 992 unsigned int *q = &(rf_qfor[3+coeff][0]); 993 unsigned r = rf_rn[coeff+1]; 994 995 /* 13 5 bit quants in a 64 bit word */ 996 length /= 8; 997 while (length) 998 { 999 a = *qbuf++; 1000 a1 = EXTRACT(a,0); 1001 a2 = EXTRACT(a,1); 1002 a1 = r ^ q[a1]; 1003 a2 = r ^ q[a2]; 1004 new = INSERT(a2,1) | a1; 1005 #define M(i,j) \ 1006 a1 = EXTRACT(a,i); \ 1007 a2 = EXTRACT(a,j); \ 1008 a1 = r ^ q[a1]; \ 1009 a2 = r ^ q[a2]; \ 1010 new = new | INSERT(a1,i) | INSERT(a2,j) 1011 1012 M(2,3); 1013 M(4,5); 1014 M(5,6); 1015 #if RF_LONGSHIFT > 2 1016 M(7,8); 1017 M(9,10); 1018 M(11,12); 1019 #endif /* RF_LONGSHIFT > 2 */ 1020 *abuf++ = new; 1021 length--; 1022 } 1023 } 1024 1025 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */ 1026