1 /* $NetBSD: rf_map.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /************************************************************************** 30 * 31 * map.c -- main code for mapping RAID addresses to physical disk addresses 32 * 33 **************************************************************************/ 34 35 /* 36 * : 37 * Log: rf_map.c,v 38 * Revision 1.53 1996/11/05 21:10:40 jimz 39 * failed pda generalization 40 * 41 * Revision 1.52 1996/08/20 19:58:39 jimz 42 * initialize numParityFailed and numQFailed to 0 in MarkFailuresInASMList 43 * 44 * Revision 1.51 1996/08/19 22:26:31 jimz 45 * add Chang's bugfixes for double-disk failures in MarkFailuresInASMList 46 * 47 * Revision 1.50 1996/08/19 21:38:06 jimz 48 * stripeOffset was uninitialized in CheckStripeForFailures 49 * 50 * Revision 1.49 1996/07/31 15:34:56 jimz 51 * evenodd changes; bugfixes for double-degraded archs, generalize 52 * some formerly PQ-only functions 53 * 54 * Revision 1.48 1996/07/27 23:36:08 jimz 55 * Solaris port of simulator 56 * 57 * Revision 1.47 1996/07/22 19:52:16 jimz 58 * switched node params to RF_DagParam_t, a union of 59 * a 64-bit int and a void *, for better portability 60 * attempted hpux port, but failed partway through for 61 * lack of a single C compiler capable of compiling all 62 * source files 63 * 64 * Revision 1.46 1996/06/10 12:50:57 jimz 65 * Add counters to freelists to track number of allocations, frees, 66 * grows, max size, etc. Adjust a couple sets of PRIME params based 67 * on the results. 68 * 69 * Revision 1.45 1996/06/10 11:55:47 jimz 70 * Straightened out some per-array/not-per-array distinctions, fixed 71 * a couple bugs related to confusion. Added shutdown lists. Removed 72 * layout shutdown function (now subsumed by shutdown lists). 73 * 74 * Revision 1.44 1996/06/09 02:36:46 jimz 75 * lots of little crufty cleanup- fixup whitespace 76 * issues, comment #ifdefs, improve typing in some 77 * places (esp size-related) 78 * 79 * Revision 1.43 1996/06/07 21:33:04 jimz 80 * begin using consistent types for sector numbers, 81 * stripe numbers, row+col numbers, recon unit numbers 82 * 83 * Revision 1.42 1996/06/05 18:06:02 jimz 84 * Major code cleanup. The Great Renaming is now done. 85 * Better modularity. Better typing. Fixed a bunch of 86 * synchronization bugs. Made a lot of global stuff 87 * per-desc or per-array. Removed dead code. 88 * 89 * Revision 1.41 1996/06/03 23:28:26 jimz 90 * more bugfixes 91 * check in tree to sync for IPDS runs with current bugfixes 92 * there still may be a problem with threads in the script test 93 * getting I/Os stuck- not trivially reproducible (runs ~50 times 94 * in a row without getting stuck) 95 * 96 * Revision 1.40 1996/05/31 22:26:54 jimz 97 * fix a lot of mapping problems, memory allocation problems 98 * found some weird lock issues, fixed 'em 99 * more code cleanup 100 * 101 * Revision 1.39 1996/05/30 23:22:16 jimz 102 * bugfixes of serialization, timing problems 103 * more cleanup 104 * 105 * Revision 1.38 1996/05/30 11:29:41 jimz 106 * Numerous bug fixes. Stripe lock release code disagreed with the taking code 107 * about when stripes should be locked (I made it consistent: no parity, no lock) 108 * There was a lot of extra serialization of I/Os which I've removed- a lot of 109 * it was to calculate values for the cache code, which is no longer with us. 110 * More types, function, macro cleanup. Added code to properly quiesce the array 111 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general 112 * before. Fixed memory allocation, freeing bugs. 113 * 114 * Revision 1.37 1996/05/27 18:56:37 jimz 115 * more code cleanup 116 * better typing 117 * compiles in all 3 environments 118 * 119 * Revision 1.36 1996/05/23 21:46:35 jimz 120 * checkpoint in code cleanup (release prep) 121 * lots of types, function names have been fixed 122 * 123 * Revision 1.35 1996/05/23 00:33:23 jimz 124 * code cleanup: move all debug decls to rf_options.c, all extern 125 * debug decls to rf_options.h, all debug vars preceded by rf_ 126 * 127 * Revision 1.34 1996/05/20 16:14:45 jimz 128 * switch to rf_{mutex,cond}_{init,destroy} 129 * 130 * Revision 1.33 1996/05/18 19:51:34 jimz 131 * major code cleanup- fix syntax, make some types consistent, 132 * add prototypes, clean out dead code, et cetera 133 * 134 * Revision 1.32 1996/05/17 00:51:47 jimz 135 * reformat for readability 136 * 137 * Revision 1.31 1996/05/16 23:06:26 jimz 138 * convert asmhdr to use RF_FREELIST stuff 139 * 140 * Revision 1.30 1996/05/16 19:09:42 jimz 141 * grow init asm freelist to 32 142 * 143 * Revision 1.29 1996/05/16 15:27:55 jimz 144 * prime freelist pumps for asm and pda lists 145 * 146 * Revision 1.28 1996/05/02 14:58:35 jimz 147 * legibility cleanup 148 * 149 * Revision 1.27 1995/12/12 18:10:06 jimz 150 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT 151 * fix 80-column brain damage in comments 152 * 153 * Revision 1.26 1995/12/01 19:25:06 root 154 * added copyright info 155 * 156 * Revision 1.25 1995/11/17 19:01:57 wvcii 157 * added call to MapQ in two fault tolerant case 158 * 159 * Revision 1.24 1995/11/17 15:10:53 wvcii 160 * fixed bug in ASMCheckStatus - ASSERT was using disk sector addresses 161 * rather than raidAddress 162 * 163 * Revision 1.23 1995/07/26 03:26:51 robby 164 * map the allocation and freeing routines for some stuff non-static 165 * 166 * Revision 1.22 1995/06/28 09:33:45 holland 167 * bug fixes related to dist sparing and multiple-row arrays 168 * 169 * Revision 1.21 1995/06/28 04:51:08 holland 170 * added some asserts against zero-length accesses 171 * 172 * Revision 1.20 1995/06/23 13:40:06 robby 173 * updeated to prototypes in rf_layout.h 174 * 175 */ 176 177 #include "rf_types.h" 178 #include "rf_threadstuff.h" 179 #include "rf_raid.h" 180 #include "rf_general.h" 181 #include "rf_map.h" 182 #include "rf_freelist.h" 183 #include "rf_shutdown.h" 184 #include "rf_sys.h" 185 186 static void rf_FreePDAList(RF_PhysDiskAddr_t *start, RF_PhysDiskAddr_t *end, int count); 187 static void rf_FreeASMList(RF_AccessStripeMap_t *start, RF_AccessStripeMap_t *end, 188 int count); 189 190 /***************************************************************************************** 191 * 192 * MapAccess -- main 1st order mapping routine. 193 * 194 * Maps an access in the RAID address space to the corresponding set of physical disk 195 * addresses. The result is returned as a list of AccessStripeMap structures, one per 196 * stripe accessed. Each ASM structure contains a pointer to a list of PhysDiskAddr 197 * structures, which describe the physical locations touched by the user access. Note 198 * that this routine returns only static mapping information, i.e. the list of physical 199 * addresses returned does not necessarily identify the set of physical locations that 200 * will actually be read or written. 201 * 202 * The routine also maps the parity. The physical disk location returned always 203 * indicates the entire parity unit, even when only a subset of it is being accessed. 204 * This is because an access that is not stripe unit aligned but that spans a stripe 205 * unit boundary may require access two distinct portions of the parity unit, and we 206 * can't yet tell which portion(s) we'll actually need. We leave it up to the algorithm 207 * selection code to decide what subset of the parity unit to access. 208 * 209 * Note that addresses in the RAID address space must always be maintained as 210 * longs, instead of ints. 211 * 212 * This routine returns NULL if numBlocks is 0 213 * 214 ****************************************************************************************/ 215 216 RF_AccessStripeMapHeader_t *rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap) 217 RF_Raid_t *raidPtr; 218 RF_RaidAddr_t raidAddress; /* starting address in RAID address space */ 219 RF_SectorCount_t numBlocks; /* number of blocks in RAID address space to access */ 220 caddr_t buffer; /* buffer to supply/receive data */ 221 int remap; /* 1 => remap addresses to spare space */ 222 { 223 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 224 RF_AccessStripeMapHeader_t *asm_hdr = NULL; 225 RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL; 226 int faultsTolerated = layoutPtr->map->faultsTolerated; 227 RF_RaidAddr_t startAddress = raidAddress; /* we'll change raidAddress along the way */ 228 RF_RaidAddr_t endAddress = raidAddress + numBlocks; 229 RF_RaidDisk_t **disks = raidPtr->Disks; 230 231 RF_PhysDiskAddr_t *pda_p, *pda_q; 232 RF_StripeCount_t numStripes = 0; 233 RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress; 234 RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr; 235 RF_StripeCount_t totStripes; 236 RF_StripeNum_t stripeID, lastSID, SUID, lastSUID; 237 RF_AccessStripeMap_t *asmList, *t_asm; 238 RF_PhysDiskAddr_t *pdaList, *t_pda; 239 240 /* allocate all the ASMs and PDAs up front */ 241 lastRaidAddr = raidAddress + numBlocks - 1 ; 242 stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress); 243 lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr); 244 totStripes = lastSID - stripeID + 1; 245 SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress); 246 lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr); 247 248 asmList = rf_AllocASMList(totStripes); 249 pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes); /* may also need pda(s) per stripe for parity */ 250 251 if (raidAddress+numBlocks > raidPtr->totalSectors) { 252 RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n", 253 (int)raidAddress); 254 return(NULL); 255 } 256 257 if (rf_mapDebug) 258 rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks); 259 for (; raidAddress < endAddress; ) { 260 /* make the next stripe structure */ 261 RF_ASSERT(asmList); 262 t_asm = asmList; 263 asmList = asmList->next; 264 bzero((char *)t_asm, sizeof(RF_AccessStripeMap_t)); 265 if (!asm_p) 266 asm_list = asm_p = t_asm; 267 else { 268 asm_p->next = t_asm; 269 asm_p = asm_p->next; 270 } 271 numStripes++; 272 273 /* map SUs from current location to the end of the stripe */ 274 asm_p->stripeID = /*rf_RaidAddressToStripeID(layoutPtr, raidAddress)*/ stripeID++; 275 stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress); 276 stripeEndAddress = RF_MIN(endAddress,stripeRealEndAddress ); 277 asm_p->raidAddress = raidAddress; 278 asm_p->endRaidAddress = stripeEndAddress; 279 280 /* map each stripe unit in the stripe */ 281 pda_p = NULL; 282 startAddrWithinStripe = raidAddress; /* Raid addr of start of portion of access that is within this stripe */ 283 for (; raidAddress < stripeEndAddress; ) { 284 RF_ASSERT(pdaList); 285 t_pda = pdaList; 286 pdaList = pdaList->next; 287 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); 288 if (!pda_p) 289 asm_p->physInfo = pda_p = t_pda; 290 else { 291 pda_p->next = t_pda; 292 pda_p = pda_p->next; 293 } 294 295 pda_p->type = RF_PDA_TYPE_DATA; 296 (layoutPtr->map->MapSector)(raidPtr, raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap); 297 298 /* mark any failures we find. failedPDA is don't-care if there is more than one failure */ 299 pda_p->raidAddress = raidAddress; /* the RAID address corresponding to this physical disk address */ 300 nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress); 301 pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress; 302 RF_ASSERT(pda_p->numSector != 0); 303 rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,0); 304 pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress)); 305 asm_p->totalSectorsAccessed += pda_p->numSector; 306 asm_p->numStripeUnitsAccessed++; 307 asm_p->origRow = pda_p->row; /* redundant but harmless to do this in every loop iteration */ 308 309 raidAddress = RF_MIN(endAddress, nextStripeUnitAddress); 310 } 311 312 /* Map the parity. At this stage, the startSector and numSector fields 313 * for the parity unit are always set to indicate the entire parity unit. 314 * We may modify this after mapping the data portion. 315 */ 316 switch (faultsTolerated) 317 { 318 case 0: 319 break; 320 case 1: /* single fault tolerant */ 321 RF_ASSERT(pdaList); 322 t_pda = pdaList; 323 pdaList = pdaList->next; 324 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); 325 pda_p = asm_p->parityInfo = t_pda; 326 pda_p->type = RF_PDA_TYPE_PARITY; 327 (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), 328 &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap); 329 pda_p->numSector = layoutPtr->sectorsPerStripeUnit; 330 /* raidAddr may be needed to find unit to redirect to */ 331 pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); 332 rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1); 333 rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p); 334 335 break; 336 case 2: /* two fault tolerant */ 337 RF_ASSERT(pdaList && pdaList->next); 338 t_pda = pdaList; 339 pdaList = pdaList->next; 340 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); 341 pda_p = asm_p->parityInfo = t_pda; 342 pda_p->type = RF_PDA_TYPE_PARITY; 343 t_pda = pdaList; 344 pdaList = pdaList->next; 345 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); 346 pda_q = asm_p->qInfo = t_pda; 347 pda_q->type = RF_PDA_TYPE_Q; 348 (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), 349 &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap); 350 (layoutPtr->map->MapQ)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), 351 &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap); 352 pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit; 353 /* raidAddr may be needed to find unit to redirect to */ 354 pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); 355 pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); 356 /* failure mode stuff */ 357 rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1); 358 rf_ASMCheckStatus(raidPtr,pda_q,asm_p,disks,1); 359 rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p); 360 rf_ASMParityAdjust(asm_p->qInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p); 361 break; 362 } 363 } 364 RF_ASSERT(asmList == NULL && pdaList == NULL); 365 /* make the header structure */ 366 asm_hdr = rf_AllocAccessStripeMapHeader(); 367 RF_ASSERT(numStripes == totStripes); 368 asm_hdr->numStripes = numStripes; 369 asm_hdr->stripeMap = asm_list; 370 371 if (rf_mapDebug) 372 rf_PrintAccessStripeMap(asm_hdr); 373 return(asm_hdr); 374 } 375 376 /***************************************************************************************** 377 * This routine walks through an ASM list and marks the PDAs that have failed. 378 * It's called only when a disk failure causes an in-flight DAG to fail. 379 * The parity may consist of two components, but we want to use only one failedPDA 380 * pointer. Thus we set failedPDA to point to the first parity component, and rely 381 * on the rest of the code to do the right thing with this. 382 ****************************************************************************************/ 383 384 void rf_MarkFailuresInASMList(raidPtr, asm_h) 385 RF_Raid_t *raidPtr; 386 RF_AccessStripeMapHeader_t *asm_h; 387 { 388 RF_RaidDisk_t **disks = raidPtr->Disks; 389 RF_AccessStripeMap_t *asmap; 390 RF_PhysDiskAddr_t *pda; 391 392 for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) { 393 asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0; 394 asmap->numFailedPDAs = 0; 395 bzero((char *)asmap->failedPDAs, 396 RF_MAX_FAILED_PDA*sizeof(RF_PhysDiskAddr_t *)); 397 for (pda = asmap->physInfo; pda; pda=pda->next) { 398 if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) { 399 printf("DEAD DISK BOGUSLY DETECTED!!\n"); 400 asmap->numDataFailed++; 401 asmap->failedPDAs[asmap->numFailedPDAs] = pda; 402 asmap->numFailedPDAs++; 403 } 404 } 405 pda = asmap->parityInfo; 406 if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) { 407 asmap->numParityFailed++; 408 asmap->failedPDAs[asmap->numFailedPDAs] = pda; 409 asmap->numFailedPDAs++; 410 } 411 pda = asmap->qInfo; 412 if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) { 413 asmap->numQFailed++; 414 asmap->failedPDAs[asmap->numFailedPDAs] = pda; 415 asmap->numFailedPDAs++; 416 } 417 } 418 } 419 420 /***************************************************************************************** 421 * 422 * DuplicateASM -- duplicates an ASM and returns the new one 423 * 424 ****************************************************************************************/ 425 RF_AccessStripeMap_t *rf_DuplicateASM(asmap) 426 RF_AccessStripeMap_t *asmap; 427 { 428 RF_AccessStripeMap_t *new_asm; 429 RF_PhysDiskAddr_t *pda, *new_pda, *t_pda; 430 431 new_pda = NULL; 432 new_asm = rf_AllocAccessStripeMapComponent(); 433 bcopy((char *)asmap, (char *)new_asm, sizeof(RF_AccessStripeMap_t)); 434 new_asm->numFailedPDAs = 0; /* ??? */ 435 new_asm->failedPDAs[0] = NULL; 436 new_asm->physInfo = NULL; 437 new_asm->parityInfo = NULL; 438 new_asm->next = NULL; 439 440 for (pda = asmap->physInfo; pda; pda=pda->next) { /* copy the physInfo list */ 441 t_pda = rf_AllocPhysDiskAddr(); 442 bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t)); 443 t_pda->next = NULL; 444 if (!new_asm->physInfo) {new_asm->physInfo = t_pda; new_pda = t_pda;} 445 else {new_pda->next = t_pda; new_pda = new_pda->next;} 446 if (pda == asmap->failedPDAs[0]) 447 new_asm->failedPDAs[0] = t_pda; 448 } 449 for (pda = asmap->parityInfo; pda; pda=pda->next) { /* copy the parityInfo list */ 450 t_pda = rf_AllocPhysDiskAddr(); 451 bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t)); 452 t_pda->next = NULL; 453 if (!new_asm->parityInfo) {new_asm->parityInfo = t_pda; new_pda = t_pda;} 454 else {new_pda->next = t_pda; new_pda = new_pda->next;} 455 if (pda == asmap->failedPDAs[0]) 456 new_asm->failedPDAs[0] = t_pda; 457 } 458 return(new_asm); 459 } 460 461 /***************************************************************************************** 462 * 463 * DuplicatePDA -- duplicates a PDA and returns the new one 464 * 465 ****************************************************************************************/ 466 RF_PhysDiskAddr_t *rf_DuplicatePDA(pda) 467 RF_PhysDiskAddr_t *pda; 468 { 469 RF_PhysDiskAddr_t *new; 470 471 new = rf_AllocPhysDiskAddr(); 472 bcopy((char *)pda, (char *)new, sizeof(RF_PhysDiskAddr_t)); 473 return(new); 474 } 475 476 /***************************************************************************************** 477 * 478 * routines to allocate and free list elements. All allocation routines zero the 479 * structure before returning it. 480 * 481 * FreePhysDiskAddr is static. It should never be called directly, because 482 * FreeAccessStripeMap takes care of freeing the PhysDiskAddr list. 483 * 484 ****************************************************************************************/ 485 486 static RF_FreeList_t *rf_asmhdr_freelist; 487 #define RF_MAX_FREE_ASMHDR 128 488 #define RF_ASMHDR_INC 16 489 #define RF_ASMHDR_INITIAL 32 490 491 static RF_FreeList_t *rf_asm_freelist; 492 #define RF_MAX_FREE_ASM 192 493 #define RF_ASM_INC 24 494 #define RF_ASM_INITIAL 64 495 496 static RF_FreeList_t *rf_pda_freelist; 497 #define RF_MAX_FREE_PDA 192 498 #define RF_PDA_INC 24 499 #define RF_PDA_INITIAL 64 500 501 /* called at shutdown time. So far, all that is necessary is to release all the free lists */ 502 static void rf_ShutdownMapModule(void *); 503 static void rf_ShutdownMapModule(ignored) 504 void *ignored; 505 { 506 RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *)); 507 RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *)); 508 RF_FREELIST_DESTROY(rf_asm_freelist,next,(RF_AccessStripeMap_t *)); 509 } 510 511 int rf_ConfigureMapModule(listp) 512 RF_ShutdownList_t **listp; 513 { 514 int rc; 515 516 RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR, 517 RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t)); 518 if (rf_asmhdr_freelist == NULL) { 519 return(ENOMEM); 520 } 521 RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM, 522 RF_ASM_INC, sizeof(RF_AccessStripeMap_t)); 523 if (rf_asm_freelist == NULL) { 524 RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *)); 525 return(ENOMEM); 526 } 527 RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA, 528 RF_PDA_INC, sizeof(RF_PhysDiskAddr_t)); 529 if (rf_pda_freelist == NULL) { 530 RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *)); 531 RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *)); 532 return(ENOMEM); 533 } 534 535 rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL); 536 if (rc) { 537 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, 538 __LINE__, rc); 539 rf_ShutdownMapModule(NULL); 540 return(rc); 541 } 542 543 RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL,next, 544 (RF_AccessStripeMapHeader_t *)); 545 RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL,next, 546 (RF_AccessStripeMap_t *)); 547 RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL,next, 548 (RF_PhysDiskAddr_t *)); 549 550 return(0); 551 } 552 553 RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader() 554 { 555 RF_AccessStripeMapHeader_t *p; 556 557 RF_FREELIST_GET(rf_asmhdr_freelist,p,next,(RF_AccessStripeMapHeader_t *)); 558 bzero((char *)p, sizeof(RF_AccessStripeMapHeader_t)); 559 560 return(p); 561 } 562 563 564 void rf_FreeAccessStripeMapHeader(p) 565 RF_AccessStripeMapHeader_t *p; 566 { 567 RF_FREELIST_FREE(rf_asmhdr_freelist,p,next); 568 } 569 570 RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr() 571 { 572 RF_PhysDiskAddr_t *p; 573 574 RF_FREELIST_GET(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *)); 575 bzero((char *)p, sizeof(RF_PhysDiskAddr_t)); 576 577 return(p); 578 } 579 580 /* allocates a list of PDAs, locking the free list only once 581 * when we have to call calloc, we do it one component at a time to simplify 582 * the process of freeing the list at program shutdown. This should not be 583 * much of a performance hit, because it should be very infrequently executed. 584 */ 585 RF_PhysDiskAddr_t *rf_AllocPDAList(count) 586 int count; 587 { 588 RF_PhysDiskAddr_t *p = NULL; 589 590 RF_FREELIST_GET_N(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *),count); 591 return(p); 592 } 593 594 void rf_FreePhysDiskAddr(p) 595 RF_PhysDiskAddr_t *p; 596 { 597 RF_FREELIST_FREE(rf_pda_freelist,p,next); 598 } 599 600 static void rf_FreePDAList(l_start, l_end, count) 601 RF_PhysDiskAddr_t *l_start, *l_end; /* pointers to start and end of list */ 602 int count; /* number of elements in list */ 603 { 604 RF_FREELIST_FREE_N(rf_pda_freelist,l_start,next,(RF_PhysDiskAddr_t *),count); 605 } 606 607 RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent() 608 { 609 RF_AccessStripeMap_t *p; 610 611 RF_FREELIST_GET(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *)); 612 bzero((char *)p, sizeof(RF_AccessStripeMap_t)); 613 614 return(p); 615 } 616 617 /* this is essentially identical to AllocPDAList. I should combine the two. 618 * when we have to call calloc, we do it one component at a time to simplify 619 * the process of freeing the list at program shutdown. This should not be 620 * much of a performance hit, because it should be very infrequently executed. 621 */ 622 RF_AccessStripeMap_t *rf_AllocASMList(count) 623 int count; 624 { 625 RF_AccessStripeMap_t *p = NULL; 626 627 RF_FREELIST_GET_N(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *),count); 628 return(p); 629 } 630 631 void rf_FreeAccessStripeMapComponent(p) 632 RF_AccessStripeMap_t *p; 633 { 634 RF_FREELIST_FREE(rf_asm_freelist,p,next); 635 } 636 637 static void rf_FreeASMList(l_start, l_end, count) 638 RF_AccessStripeMap_t *l_start, *l_end; 639 int count; 640 { 641 RF_FREELIST_FREE_N(rf_asm_freelist,l_start,next,(RF_AccessStripeMap_t *),count); 642 } 643 644 void rf_FreeAccessStripeMap(hdr) 645 RF_AccessStripeMapHeader_t *hdr; 646 { 647 RF_AccessStripeMap_t *p, *pt = NULL; 648 RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL; 649 int count = 0, t, asm_count = 0; 650 651 for (p = hdr->stripeMap; p; p=p->next) { 652 653 /* link the 3 pda lists into the accumulating pda list */ 654 655 if (!pdaList) pdaList = p->qInfo; else pdaEnd->next = p->qInfo; 656 for (trailer=NULL,pdp=p->qInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;} 657 if (trailer) pdaEnd = trailer; 658 659 if (!pdaList) pdaList = p->parityInfo; else pdaEnd->next = p->parityInfo; 660 for (trailer=NULL,pdp=p->parityInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;} 661 if (trailer) pdaEnd = trailer; 662 663 if (!pdaList) pdaList = p->physInfo; else pdaEnd->next = p->physInfo; 664 for (trailer=NULL,pdp=p->physInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;} 665 if (trailer) pdaEnd = trailer; 666 667 pt = p; 668 asm_count++; 669 } 670 671 /* debug only */ 672 for (t=0,pdp=pdaList; pdp; pdp=pdp->next) 673 t++; 674 RF_ASSERT(t == count); 675 676 if (pdaList) 677 rf_FreePDAList(pdaList, pdaEnd, count); 678 rf_FreeASMList(hdr->stripeMap, pt, asm_count); 679 rf_FreeAccessStripeMapHeader(hdr); 680 } 681 682 /* We can't use the large write optimization if there are any failures in the stripe. 683 * In the declustered layout, there is no way to immediately determine what disks 684 * constitute a stripe, so we actually have to hunt through the stripe looking for failures. 685 * The reason we map the parity instead of just using asm->parityInfo->col is because 686 * the latter may have been already redirected to a spare drive, which would 687 * mess up the computation of the stripe offset. 688 * 689 * ASSUMES AT MOST ONE FAILURE IN THE STRIPE. 690 */ 691 int rf_CheckStripeForFailures(raidPtr, asmap) 692 RF_Raid_t *raidPtr; 693 RF_AccessStripeMap_t *asmap; 694 { 695 RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i; 696 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 697 RF_StripeCount_t stripeOffset; 698 int numFailures; 699 RF_RaidAddr_t sosAddr; 700 RF_SectorNum_t diskOffset, poffset; 701 RF_RowCol_t testrow; 702 703 /* quick out in the fault-free case. */ 704 RF_LOCK_MUTEX(raidPtr->mutex); 705 numFailures = raidPtr->numFailures; 706 RF_UNLOCK_MUTEX(raidPtr->mutex); 707 if (numFailures == 0) return(0); 708 709 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 710 row = asmap->physInfo->row; 711 (layoutPtr->map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &testrow); 712 (layoutPtr->map->MapParity)(raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0); /* get pcol */ 713 714 /* this need not be true if we've redirected the access to a spare in another row 715 RF_ASSERT(row == testrow); 716 */ 717 stripeOffset = 0; 718 for (i=0; i<layoutPtr->numDataCol+layoutPtr->numParityCol; i++) { 719 if (diskids[i] != pcol) { 720 if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) { 721 if (raidPtr->status[testrow] != rf_rs_reconstructing) 722 return(1); 723 RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]); 724 layoutPtr->map->MapSector(raidPtr, 725 sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit, 726 &trow, &tcol, &diskOffset, 0); 727 RF_ASSERT( (trow == testrow) && (tcol == diskids[i]) ); 728 if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset)) 729 return(1); 730 asmap->flags |= RF_ASM_REDIR_LARGE_WRITE; 731 return(0); 732 } 733 stripeOffset++; 734 } 735 } 736 return(0); 737 } 738 739 /* 740 return the number of failed data units in the stripe. 741 */ 742 743 int rf_NumFailedDataUnitsInStripe(raidPtr, asmap) 744 RF_Raid_t *raidPtr; 745 RF_AccessStripeMap_t *asmap; 746 { 747 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 748 RF_RowCol_t trow, tcol, row, i; 749 RF_SectorNum_t diskOffset; 750 RF_RaidAddr_t sosAddr; 751 int numFailures; 752 753 /* quick out in the fault-free case. */ 754 RF_LOCK_MUTEX(raidPtr->mutex); 755 numFailures = raidPtr->numFailures; 756 RF_UNLOCK_MUTEX(raidPtr->mutex); 757 if (numFailures == 0) return(0); 758 numFailures = 0; 759 760 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 761 row = asmap->physInfo->row; 762 for (i=0; i<layoutPtr->numDataCol; i++) 763 { 764 (layoutPtr->map->MapSector)(raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit, 765 &trow, &tcol, &diskOffset, 0); 766 if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status)) 767 numFailures++; 768 } 769 770 return numFailures; 771 } 772 773 774 /***************************************************************************************** 775 * 776 * debug routines 777 * 778 ****************************************************************************************/ 779 780 void rf_PrintAccessStripeMap(asm_h) 781 RF_AccessStripeMapHeader_t *asm_h; 782 { 783 rf_PrintFullAccessStripeMap(asm_h, 0); 784 } 785 786 void rf_PrintFullAccessStripeMap(asm_h, prbuf) 787 RF_AccessStripeMapHeader_t *asm_h; 788 int prbuf; /* flag to print buffer pointers */ 789 { 790 int i; 791 RF_AccessStripeMap_t *asmap = asm_h->stripeMap; 792 RF_PhysDiskAddr_t *p; 793 printf("%d stripes total\n", (int)asm_h->numStripes); 794 for (; asmap; asmap = asmap->next) { 795 /* printf("Num failures: %d\n",asmap->numDataFailed); */ 796 /* printf("Num sectors: %d\n",(int)asmap->totalSectorsAccessed); */ 797 printf("Stripe %d (%d sectors), failures: %d data, %d parity: ", 798 (int) asmap->stripeID, 799 (int) asmap->totalSectorsAccessed, 800 (int) asmap->numDataFailed, 801 (int) asmap->numParityFailed); 802 if (asmap->parityInfo) { 803 printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col, 804 (int)asmap->parityInfo->startSector, 805 (int)(asmap->parityInfo->startSector + 806 asmap->parityInfo->numSector - 1)); 807 if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->bufPtr); 808 if (asmap->parityInfo->next) { 809 printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row, 810 asmap->parityInfo->next->col, 811 (int) asmap->parityInfo->next->startSector, 812 (int)(asmap->parityInfo->next->startSector + 813 asmap->parityInfo->next->numSector - 1)); 814 if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->next->bufPtr); 815 RF_ASSERT(asmap->parityInfo->next->next == NULL); 816 } 817 printf("]\n\t"); 818 } 819 for (i=0,p=asmap->physInfo; p; p=p->next,i++) { 820 printf("SU r%d c%d s%d-%d ", p->row, p->col, (int)p->startSector, 821 (int)(p->startSector + p->numSector - 1)); 822 if (prbuf) printf("b0x%lx ", (unsigned long) p->bufPtr); 823 if (i && !(i&1)) printf("\n\t"); 824 } 825 printf("\n"); 826 p = asm_h->stripeMap->failedPDAs[0]; 827 if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 1) printf("[multiple failures]\n"); 828 else if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 0) 829 printf("\t[Failed PDA: r%d c%d s%d-%d]\n",p->row, p->col, 830 (int)p->startSector, (int)(p->startSector + p->numSector-1)); 831 } 832 } 833 834 void rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks) 835 RF_Raid_t *raidPtr; 836 RF_RaidAddr_t raidAddr; 837 RF_SectorCount_t numBlocks; 838 { 839 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 840 RF_RaidAddr_t ra, sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr); 841 842 printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t"); 843 for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) { 844 printf("%d (0x%x), ",(int)ra, (int)ra); 845 } 846 printf("\n"); 847 printf("Offset into stripe unit: %d (0x%x)\n", 848 (int)(raidAddr % layoutPtr->sectorsPerStripeUnit), 849 (int)(raidAddr % layoutPtr->sectorsPerStripeUnit)); 850 } 851 852 /* 853 given a parity descriptor and the starting address within a stripe, 854 range restrict the parity descriptor to touch only the correct stuff. 855 */ 856 void rf_ASMParityAdjust( 857 RF_PhysDiskAddr_t *toAdjust, 858 RF_StripeNum_t startAddrWithinStripe, 859 RF_SectorNum_t endAddress, 860 RF_RaidLayout_t *layoutPtr, 861 RF_AccessStripeMap_t *asm_p) 862 { 863 RF_PhysDiskAddr_t *new_pda; 864 865 /* when we're accessing only a portion of one stripe unit, we want the parity descriptor 866 * to identify only the chunk of parity associated with the data. When the access spans 867 * exactly one stripe unit boundary and is less than a stripe unit in size, it uses two disjoint 868 * regions of the parity unit. When an access spans more than one stripe unit boundary, it 869 * uses all of the parity unit. 870 * 871 * To better handle the case where stripe units are small, we may eventually want to change 872 * the 2nd case so that if the SU size is below some threshold, we just read/write the whole 873 * thing instead of breaking it up into two accesses. 874 */ 875 if (asm_p->numStripeUnitsAccessed == 1) 876 { 877 int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit); 878 toAdjust->startSector += x; 879 toAdjust->raidAddress += x; 880 toAdjust->numSector = asm_p->physInfo->numSector; 881 RF_ASSERT(toAdjust->numSector != 0); 882 } 883 else 884 if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit) 885 { 886 int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit); 887 888 /* create a second pda and copy the parity map info into it */ 889 RF_ASSERT(toAdjust->next == NULL); 890 new_pda = toAdjust->next = rf_AllocPhysDiskAddr(); 891 *new_pda = *toAdjust; /* structure assignment */ 892 new_pda->next = NULL; 893 894 /* adjust the start sector & number of blocks for the first parity pda */ 895 toAdjust->startSector += x; 896 toAdjust->raidAddress += x; 897 toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe; 898 RF_ASSERT(toAdjust->numSector != 0); 899 900 /* adjust the second pda */ 901 new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress); 902 /*new_pda->raidAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, toAdjust->raidAddress);*/ 903 RF_ASSERT(new_pda->numSector != 0); 904 } 905 } 906 907 /* 908 Check if a disk has been spared or failed. If spared, 909 redirect the I/O. 910 If it has been failed, record it in the asm pointer. 911 Fourth arg is whether data or parity. 912 */ 913 void rf_ASMCheckStatus( 914 RF_Raid_t *raidPtr, 915 RF_PhysDiskAddr_t *pda_p, 916 RF_AccessStripeMap_t *asm_p, 917 RF_RaidDisk_t **disks, 918 int parity) 919 { 920 RF_DiskStatus_t dstatus; 921 RF_RowCol_t frow, fcol; 922 923 dstatus = disks[pda_p->row][pda_p->col].status; 924 925 if (dstatus == rf_ds_spared) { 926 /* if the disk has been spared, redirect access to the spare */ 927 frow = pda_p->row; fcol = pda_p->col; 928 pda_p->row = disks[frow][fcol].spareRow; 929 pda_p->col = disks[frow][fcol].spareCol; 930 } 931 else if (dstatus == rf_ds_dist_spared) { 932 /* ditto if disk has been spared to dist spare space */ 933 RF_RowCol_t or = pda_p->row, oc=pda_p->col; 934 RF_SectorNum_t oo = pda_p->startSector; 935 936 if (pda_p -> type == RF_PDA_TYPE_DATA) 937 raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP); 938 else 939 raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP); 940 941 if (rf_mapDebug) { 942 printf("Redirected r %d c %d o %d -> r%d c %d o %d\n",or,oc,(int)oo, 943 pda_p->row,pda_p->col,(int)pda_p->startSector); 944 } 945 } else if (RF_DEAD_DISK(dstatus)) { 946 /* if the disk is inaccessible, mark the failure */ 947 if (parity) 948 asm_p->numParityFailed++; 949 else { 950 asm_p->numDataFailed++; 951 #if 0 952 /* XXX Do we really want this spewing out on the console? GO */ 953 printf("DATA_FAILED!\n"); 954 #endif 955 } 956 asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p; 957 asm_p->numFailedPDAs++; 958 #if 0 959 switch (asm_p->numParityFailed + asm_p->numDataFailed) 960 { 961 case 1: 962 asm_p->failedPDAs[0] = pda_p; 963 break; 964 case 2: 965 asm_p->failedPDAs[1] = pda_p; 966 default: 967 break; 968 } 969 #endif 970 } 971 /* the redirected access should never span a stripe unit boundary */ 972 RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress) == 973 rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress + pda_p->numSector -1)); 974 RF_ASSERT(pda_p->col != -1); 975 } 976