1 /* $NetBSD: rf_reconstruct.c,v 1.113 2011/05/11 18:13:12 mrg Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /************************************************************ 30 * 31 * rf_reconstruct.c -- code to perform on-line reconstruction 32 * 33 ************************************************************/ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.113 2011/05/11 18:13:12 mrg Exp $"); 37 38 #include <sys/param.h> 39 #include <sys/time.h> 40 #include <sys/buf.h> 41 #include <sys/errno.h> 42 #include <sys/systm.h> 43 #include <sys/proc.h> 44 #include <sys/ioctl.h> 45 #include <sys/fcntl.h> 46 #include <sys/vnode.h> 47 #include <sys/namei.h> /* for pathbuf */ 48 #include <dev/raidframe/raidframevar.h> 49 50 #include "rf_raid.h" 51 #include "rf_reconutil.h" 52 #include "rf_revent.h" 53 #include "rf_reconbuffer.h" 54 #include "rf_acctrace.h" 55 #include "rf_etimer.h" 56 #include "rf_dag.h" 57 #include "rf_desc.h" 58 #include "rf_debugprint.h" 59 #include "rf_general.h" 60 #include "rf_driver.h" 61 #include "rf_utils.h" 62 #include "rf_shutdown.h" 63 64 #include "rf_kintf.h" 65 66 /* setting these to -1 causes them to be set to their default values if not set by debug options */ 67 68 #if RF_DEBUG_RECON 69 #define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) 70 #define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 71 #define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 72 #define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) 73 #define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL) 74 #define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) 75 #define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL) 76 #define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) 77 78 #define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 79 #define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 80 81 #else /* RF_DEBUG_RECON */ 82 83 #define Dprintf(s) {} 84 #define Dprintf1(s,a) {} 85 #define Dprintf2(s,a,b) {} 86 #define Dprintf3(s,a,b,c) {} 87 #define Dprintf4(s,a,b,c,d) {} 88 #define Dprintf5(s,a,b,c,d,e) {} 89 #define Dprintf6(s,a,b,c,d,e,f) {} 90 #define Dprintf7(s,a,b,c,d,e,f,g) {} 91 92 #define DDprintf1(s,a) {} 93 #define DDprintf2(s,a,b) {} 94 95 #endif /* RF_DEBUG_RECON */ 96 97 #define RF_RECON_DONE_READS 1 98 #define RF_RECON_READ_ERROR 2 99 #define RF_RECON_WRITE_ERROR 3 100 #define RF_RECON_READ_STOPPED 4 101 #define RF_RECON_WRITE_DONE 5 102 103 #define RF_MAX_FREE_RECONBUFFER 32 104 #define RF_MIN_FREE_RECONBUFFER 16 105 106 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t, 107 RF_RaidDisk_t *, int, RF_RowCol_t); 108 static void FreeReconDesc(RF_RaidReconDesc_t *); 109 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *); 110 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t); 111 static int TryToRead(RF_Raid_t *, RF_RowCol_t); 112 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t, 113 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *, 114 RF_SectorNum_t *); 115 static int IssueNextWriteRequest(RF_Raid_t *); 116 static int ReconReadDoneProc(void *, int); 117 static int ReconWriteDoneProc(void *, int); 118 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t); 119 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *, 120 RF_RowCol_t, RF_HeadSepLimit_t, 121 RF_ReconUnitNum_t); 122 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *, 123 RF_ReconParityStripeStatus_t *, 124 RF_PerDiskReconCtrl_t *, 125 RF_RowCol_t, RF_StripeNum_t, 126 RF_ReconUnitNum_t); 127 static void ForceReconReadDoneProc(void *, int); 128 static void rf_ShutdownReconstruction(void *); 129 130 struct RF_ReconDoneProc_s { 131 void (*proc) (RF_Raid_t *, void *); 132 void *arg; 133 RF_ReconDoneProc_t *next; 134 }; 135 136 /************************************************************************** 137 * 138 * sets up the parameters that will be used by the reconstruction process 139 * currently there are none, except for those that the layout-specific 140 * configuration (e.g. rf_ConfigureDeclustered) routine sets up. 141 * 142 * in the kernel, we fire off the recon thread. 143 * 144 **************************************************************************/ 145 static void 146 rf_ShutdownReconstruction(void *ignored) 147 { 148 pool_destroy(&rf_pools.reconbuffer); 149 } 150 151 int 152 rf_ConfigureReconstruction(RF_ShutdownList_t **listp) 153 { 154 155 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t), 156 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER); 157 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL); 158 159 return (0); 160 } 161 162 static RF_RaidReconDesc_t * 163 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col, 164 RF_RaidDisk_t *spareDiskPtr, int numDisksDone, 165 RF_RowCol_t scol) 166 { 167 168 RF_RaidReconDesc_t *reconDesc; 169 170 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t), 171 (RF_RaidReconDesc_t *)); 172 reconDesc->raidPtr = raidPtr; 173 reconDesc->col = col; 174 reconDesc->spareDiskPtr = spareDiskPtr; 175 reconDesc->numDisksDone = numDisksDone; 176 reconDesc->scol = scol; 177 reconDesc->next = NULL; 178 179 return (reconDesc); 180 } 181 182 static void 183 FreeReconDesc(RF_RaidReconDesc_t *reconDesc) 184 { 185 #if RF_RECON_STATS > 0 186 printf("raid%d: %lu recon event waits, %lu recon delays\n", 187 reconDesc->raidPtr->raidid, 188 (long) reconDesc->numReconEventWaits, 189 (long) reconDesc->numReconExecDelays); 190 #endif /* RF_RECON_STATS > 0 */ 191 printf("raid%d: %lu max exec ticks\n", 192 reconDesc->raidPtr->raidid, 193 (long) reconDesc->maxReconExecTicks); 194 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t)); 195 } 196 197 198 /***************************************************************************** 199 * 200 * primary routine to reconstruct a failed disk. This should be called from 201 * within its own thread. It won't return until reconstruction completes, 202 * fails, or is aborted. 203 *****************************************************************************/ 204 int 205 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col) 206 { 207 const RF_LayoutSW_t *lp; 208 int rc; 209 210 lp = raidPtr->Layout.map; 211 if (lp->SubmitReconBuffer) { 212 /* 213 * The current infrastructure only supports reconstructing one 214 * disk at a time for each array. 215 */ 216 rf_lock_mutex2(raidPtr->mutex); 217 while (raidPtr->reconInProgress) { 218 rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex); 219 } 220 raidPtr->reconInProgress++; 221 rf_unlock_mutex2(raidPtr->mutex); 222 rc = rf_ReconstructFailedDiskBasic(raidPtr, col); 223 rf_lock_mutex2(raidPtr->mutex); 224 raidPtr->reconInProgress--; 225 } else { 226 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", 227 lp->parityConfig); 228 rc = EIO; 229 rf_lock_mutex2(raidPtr->mutex); 230 } 231 rf_signal_cond2(raidPtr->waitForReconCond); 232 rf_unlock_mutex2(raidPtr->mutex); 233 return (rc); 234 } 235 236 int 237 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col) 238 { 239 RF_ComponentLabel_t *c_label; 240 RF_RaidDisk_t *spareDiskPtr = NULL; 241 RF_RaidReconDesc_t *reconDesc; 242 RF_RowCol_t scol; 243 int numDisksDone = 0, rc; 244 245 /* first look for a spare drive onto which to reconstruct the data */ 246 /* spare disk descriptors are stored in row 0. This may have to 247 * change eventually */ 248 249 rf_lock_mutex2(raidPtr->mutex); 250 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); 251 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 252 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 253 if (raidPtr->status != rf_rs_degraded) { 254 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col); 255 rf_unlock_mutex2(raidPtr->mutex); 256 return (EINVAL); 257 } 258 scol = (-1); 259 } else { 260 #endif 261 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) { 262 if (raidPtr->Disks[scol].status == rf_ds_spare) { 263 spareDiskPtr = &raidPtr->Disks[scol]; 264 spareDiskPtr->status = rf_ds_used_spare; 265 break; 266 } 267 } 268 if (!spareDiskPtr) { 269 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col); 270 rf_unlock_mutex2(raidPtr->mutex); 271 return (ENOSPC); 272 } 273 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol); 274 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 275 } 276 #endif 277 rf_unlock_mutex2(raidPtr->mutex); 278 279 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol); 280 raidPtr->reconDesc = (void *) reconDesc; 281 #if RF_RECON_STATS > 0 282 reconDesc->hsStallCount = 0; 283 reconDesc->numReconExecDelays = 0; 284 reconDesc->numReconEventWaits = 0; 285 #endif /* RF_RECON_STATS > 0 */ 286 reconDesc->reconExecTimerRunning = 0; 287 reconDesc->reconExecTicks = 0; 288 reconDesc->maxReconExecTicks = 0; 289 rc = rf_ContinueReconstructFailedDisk(reconDesc); 290 291 if (!rc) { 292 /* fix up the component label */ 293 /* Don't actually need the read here.. */ 294 c_label = raidget_component_label(raidPtr, scol); 295 296 raid_init_component_label(raidPtr, c_label); 297 c_label->row = 0; 298 c_label->column = col; 299 c_label->clean = RF_RAID_DIRTY; 300 c_label->status = rf_ds_optimal; 301 rf_component_label_set_partitionsize(c_label, 302 raidPtr->Disks[scol].partitionSize); 303 304 /* We've just done a rebuild based on all the other 305 disks, so at this point the parity is known to be 306 clean, even if it wasn't before. */ 307 308 /* XXX doesn't hold for RAID 6!!*/ 309 310 rf_lock_mutex2(raidPtr->mutex); 311 raidPtr->parity_good = RF_RAID_CLEAN; 312 rf_unlock_mutex2(raidPtr->mutex); 313 314 /* XXXX MORE NEEDED HERE */ 315 316 raidflush_component_label(raidPtr, scol); 317 } else { 318 /* Reconstruct failed. */ 319 320 rf_lock_mutex2(raidPtr->mutex); 321 /* Failed disk goes back to "failed" status */ 322 raidPtr->Disks[col].status = rf_ds_failed; 323 324 /* Spare disk goes back to "spare" status. */ 325 spareDiskPtr->status = rf_ds_spare; 326 rf_unlock_mutex2(raidPtr->mutex); 327 328 } 329 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 330 return (rc); 331 } 332 333 /* 334 335 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL, 336 and you don't get a spare until the next Monday. With this function 337 (and hot-swappable drives) you can now put your new disk containing 338 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to 339 rebuild the data "on the spot". 340 341 */ 342 343 int 344 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col) 345 { 346 RF_RaidDisk_t *spareDiskPtr = NULL; 347 RF_RaidReconDesc_t *reconDesc; 348 const RF_LayoutSW_t *lp; 349 RF_ComponentLabel_t *c_label; 350 int numDisksDone = 0, rc; 351 struct partinfo dpart; 352 struct pathbuf *pb; 353 struct vnode *vp; 354 struct vattr va; 355 int retcode; 356 int ac; 357 358 rf_lock_mutex2(raidPtr->mutex); 359 lp = raidPtr->Layout.map; 360 if (!lp->SubmitReconBuffer) { 361 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", 362 lp->parityConfig); 363 /* wakeup anyone who might be waiting to do a reconstruct */ 364 rf_signal_cond2(raidPtr->waitForReconCond); 365 rf_unlock_mutex2(raidPtr->mutex); 366 return(EIO); 367 } 368 369 /* 370 * The current infrastructure only supports reconstructing one 371 * disk at a time for each array. 372 */ 373 374 if (raidPtr->Disks[col].status != rf_ds_failed) { 375 /* "It's gone..." */ 376 raidPtr->numFailures++; 377 raidPtr->Disks[col].status = rf_ds_failed; 378 raidPtr->status = rf_rs_degraded; 379 rf_unlock_mutex2(raidPtr->mutex); 380 rf_update_component_labels(raidPtr, 381 RF_NORMAL_COMPONENT_UPDATE); 382 rf_lock_mutex2(raidPtr->mutex); 383 } 384 385 while (raidPtr->reconInProgress) { 386 rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex); 387 } 388 389 raidPtr->reconInProgress++; 390 391 /* first look for a spare drive onto which to reconstruct the 392 data. spare disk descriptors are stored in row 0. This 393 may have to change eventually */ 394 395 /* Actually, we don't care if it's failed or not... On a RAID 396 set with correct parity, this function should be callable 397 on any component without ill effects. */ 398 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */ 399 400 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 401 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 402 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col); 403 404 raidPtr->reconInProgress--; 405 rf_signal_cond2(raidPtr->waitForReconCond); 406 rf_unlock_mutex2(raidPtr->mutex); 407 return (EINVAL); 408 } 409 #endif 410 411 /* This device may have been opened successfully the 412 first time. Close it before trying to open it again.. */ 413 414 if (raidPtr->raid_cinfo[col].ci_vp != NULL) { 415 #if 0 416 printf("Closed the open device: %s\n", 417 raidPtr->Disks[col].devname); 418 #endif 419 vp = raidPtr->raid_cinfo[col].ci_vp; 420 ac = raidPtr->Disks[col].auto_configured; 421 rf_unlock_mutex2(raidPtr->mutex); 422 rf_close_component(raidPtr, vp, ac); 423 rf_lock_mutex2(raidPtr->mutex); 424 raidPtr->raid_cinfo[col].ci_vp = NULL; 425 } 426 /* note that this disk was *not* auto_configured (any longer)*/ 427 raidPtr->Disks[col].auto_configured = 0; 428 429 #if 0 430 printf("About to (re-)open the device for rebuilding: %s\n", 431 raidPtr->Disks[col].devname); 432 #endif 433 rf_unlock_mutex2(raidPtr->mutex); 434 pb = pathbuf_create(raidPtr->Disks[col].devname); 435 if (pb == NULL) { 436 retcode = ENOMEM; 437 } else { 438 retcode = dk_lookup(pb, curlwp, &vp); 439 pathbuf_destroy(pb); 440 } 441 442 if (retcode) { 443 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid, 444 raidPtr->Disks[col].devname, retcode); 445 446 /* the component isn't responding properly... 447 must be still dead :-( */ 448 rf_lock_mutex2(raidPtr->mutex); 449 raidPtr->reconInProgress--; 450 rf_signal_cond2(raidPtr->waitForReconCond); 451 rf_unlock_mutex2(raidPtr->mutex); 452 return(retcode); 453 } 454 455 /* Ok, so we can at least do a lookup... 456 How about actually getting a vp for it? */ 457 458 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) { 459 rf_lock_mutex2(raidPtr->mutex); 460 raidPtr->reconInProgress--; 461 rf_signal_cond2(raidPtr->waitForReconCond); 462 rf_unlock_mutex2(raidPtr->mutex); 463 return(retcode); 464 } 465 466 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred); 467 if (retcode) { 468 rf_lock_mutex2(raidPtr->mutex); 469 raidPtr->reconInProgress--; 470 rf_signal_cond2(raidPtr->waitForReconCond); 471 rf_unlock_mutex2(raidPtr->mutex); 472 return(retcode); 473 } 474 rf_lock_mutex2(raidPtr->mutex); 475 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize; 476 477 raidPtr->Disks[col].numBlocks = dpart.part->p_size - 478 rf_protectedSectors; 479 480 raidPtr->raid_cinfo[col].ci_vp = vp; 481 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev; 482 483 raidPtr->Disks[col].dev = va.va_rdev; 484 485 /* we allow the user to specify that only a fraction 486 of the disks should be used this is just for debug: 487 it speeds up * the parity scan */ 488 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks * 489 rf_sizePercentage / 100; 490 rf_unlock_mutex2(raidPtr->mutex); 491 492 spareDiskPtr = &raidPtr->Disks[col]; 493 spareDiskPtr->status = rf_ds_used_spare; 494 495 printf("raid%d: initiating in-place reconstruction on column %d\n", 496 raidPtr->raidid, col); 497 498 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, 499 numDisksDone, col); 500 raidPtr->reconDesc = (void *) reconDesc; 501 #if RF_RECON_STATS > 0 502 reconDesc->hsStallCount = 0; 503 reconDesc->numReconExecDelays = 0; 504 reconDesc->numReconEventWaits = 0; 505 #endif /* RF_RECON_STATS > 0 */ 506 reconDesc->reconExecTimerRunning = 0; 507 reconDesc->reconExecTicks = 0; 508 reconDesc->maxReconExecTicks = 0; 509 rc = rf_ContinueReconstructFailedDisk(reconDesc); 510 511 if (!rc) { 512 rf_lock_mutex2(raidPtr->mutex); 513 /* Need to set these here, as at this point it'll be claiming 514 that the disk is in rf_ds_spared! But we know better :-) */ 515 516 raidPtr->Disks[col].status = rf_ds_optimal; 517 raidPtr->status = rf_rs_optimal; 518 rf_unlock_mutex2(raidPtr->mutex); 519 520 /* fix up the component label */ 521 /* Don't actually need the read here.. */ 522 c_label = raidget_component_label(raidPtr, col); 523 524 rf_lock_mutex2(raidPtr->mutex); 525 raid_init_component_label(raidPtr, c_label); 526 527 c_label->row = 0; 528 c_label->column = col; 529 530 /* We've just done a rebuild based on all the other 531 disks, so at this point the parity is known to be 532 clean, even if it wasn't before. */ 533 534 /* XXX doesn't hold for RAID 6!!*/ 535 536 raidPtr->parity_good = RF_RAID_CLEAN; 537 rf_unlock_mutex2(raidPtr->mutex); 538 539 raidflush_component_label(raidPtr, col); 540 } else { 541 /* Reconstruct-in-place failed. Disk goes back to 542 "failed" status, regardless of what it was before. */ 543 rf_lock_mutex2(raidPtr->mutex); 544 raidPtr->Disks[col].status = rf_ds_failed; 545 rf_unlock_mutex2(raidPtr->mutex); 546 } 547 548 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 549 550 rf_lock_mutex2(raidPtr->mutex); 551 raidPtr->reconInProgress--; 552 rf_signal_cond2(raidPtr->waitForReconCond); 553 rf_unlock_mutex2(raidPtr->mutex); 554 555 return (rc); 556 } 557 558 559 int 560 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc) 561 { 562 RF_Raid_t *raidPtr = reconDesc->raidPtr; 563 RF_RowCol_t col = reconDesc->col; 564 RF_RowCol_t scol = reconDesc->scol; 565 RF_ReconMap_t *mapPtr; 566 RF_ReconCtrl_t *tmp_reconctrl; 567 RF_ReconEvent_t *event; 568 RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev; 569 RF_ReconUnitCount_t RUsPerPU; 570 struct timeval etime, elpsd; 571 unsigned long xor_s, xor_resid_us; 572 int i, ds; 573 int status, done; 574 int recon_error, write_error; 575 576 raidPtr->accumXorTimeUs = 0; 577 #if RF_ACC_TRACE > 0 578 /* create one trace record per physical disk */ 579 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); 580 #endif 581 582 /* quiesce the array prior to starting recon. this is needed 583 * to assure no nasty interactions with pending user writes. 584 * We need to do this before we change the disk or row status. */ 585 586 Dprintf("RECON: begin request suspend\n"); 587 rf_SuspendNewRequestsAndWait(raidPtr); 588 Dprintf("RECON: end request suspend\n"); 589 590 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */ 591 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol); 592 593 rf_lock_mutex2(raidPtr->mutex); 594 595 /* create the reconstruction control pointer and install it in 596 * the right slot */ 597 raidPtr->reconControl = tmp_reconctrl; 598 mapPtr = raidPtr->reconControl->reconMap; 599 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs; 600 raidPtr->reconControl->numRUsComplete = 0; 601 raidPtr->status = rf_rs_reconstructing; 602 raidPtr->Disks[col].status = rf_ds_reconstructing; 603 raidPtr->Disks[col].spareCol = scol; 604 605 rf_unlock_mutex2(raidPtr->mutex); 606 607 RF_GETTIME(raidPtr->reconControl->starttime); 608 609 Dprintf("RECON: resume requests\n"); 610 rf_ResumeNewRequests(raidPtr); 611 612 613 mapPtr = raidPtr->reconControl->reconMap; 614 615 incPSID = RF_RECONMAP_SIZE; 616 lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU; 617 RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU; 618 recon_error = 0; 619 write_error = 0; 620 pending_writes = incPSID; 621 raidPtr->reconControl->lastPSID = incPSID; 622 623 /* start the actual reconstruction */ 624 625 done = 0; 626 while (!done) { 627 628 if (raidPtr->waitShutdown) { 629 /* someone is unconfiguring this array... bail on the reconstruct.. */ 630 recon_error = 1; 631 break; 632 } 633 634 num_writes = 0; 635 636 /* issue a read for each surviving disk */ 637 638 reconDesc->numDisksDone = 0; 639 for (i = 0; i < raidPtr->numCol; i++) { 640 if (i != col) { 641 /* find and issue the next I/O on the 642 * indicated disk */ 643 if (IssueNextReadRequest(raidPtr, i)) { 644 Dprintf1("RECON: done issuing for c%d\n", i); 645 reconDesc->numDisksDone++; 646 } 647 } 648 } 649 650 /* process reconstruction events until all disks report that 651 * they've completed all work */ 652 653 while (reconDesc->numDisksDone < raidPtr->numCol - 1) { 654 655 event = rf_GetNextReconEvent(reconDesc); 656 status = ProcessReconEvent(raidPtr, event); 657 658 /* the normal case is that a read completes, and all is well. */ 659 if (status == RF_RECON_DONE_READS) { 660 reconDesc->numDisksDone++; 661 } else if ((status == RF_RECON_READ_ERROR) || 662 (status == RF_RECON_WRITE_ERROR)) { 663 /* an error was encountered while reconstructing... 664 Pretend we've finished this disk. 665 */ 666 recon_error = 1; 667 raidPtr->reconControl->error = 1; 668 669 /* bump the numDisksDone count for reads, 670 but not for writes */ 671 if (status == RF_RECON_READ_ERROR) 672 reconDesc->numDisksDone++; 673 674 /* write errors are special -- when we are 675 done dealing with the reads that are 676 finished, we don't want to wait for any 677 writes */ 678 if (status == RF_RECON_WRITE_ERROR) { 679 write_error = 1; 680 num_writes++; 681 } 682 683 } else if (status == RF_RECON_READ_STOPPED) { 684 /* count this component as being "done" */ 685 reconDesc->numDisksDone++; 686 } else if (status == RF_RECON_WRITE_DONE) { 687 num_writes++; 688 } 689 690 if (recon_error) { 691 /* make sure any stragglers are woken up so that 692 their theads will complete, and we can get out 693 of here with all IO processed */ 694 695 rf_WakeupHeadSepCBWaiters(raidPtr); 696 } 697 698 raidPtr->reconControl->numRUsTotal = 699 mapPtr->totalRUs; 700 raidPtr->reconControl->numRUsComplete = 701 mapPtr->totalRUs - 702 rf_UnitsLeftToReconstruct(mapPtr); 703 704 #if RF_DEBUG_RECON 705 raidPtr->reconControl->percentComplete = 706 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 707 if (rf_prReconSched) { 708 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime)); 709 } 710 #endif 711 } 712 713 /* reads done, wakup any waiters, and then wait for writes */ 714 715 rf_WakeupHeadSepCBWaiters(raidPtr); 716 717 while (!recon_error && (num_writes < pending_writes)) { 718 event = rf_GetNextReconEvent(reconDesc); 719 status = ProcessReconEvent(raidPtr, event); 720 721 if (status == RF_RECON_WRITE_ERROR) { 722 num_writes++; 723 recon_error = 1; 724 raidPtr->reconControl->error = 1; 725 /* an error was encountered at the very end... bail */ 726 } else if (status == RF_RECON_WRITE_DONE) { 727 num_writes++; 728 } /* else it's something else, and we don't care */ 729 } 730 if (recon_error || 731 (raidPtr->reconControl->lastPSID == lastPSID)) { 732 done = 1; 733 break; 734 } 735 736 prev = raidPtr->reconControl->lastPSID; 737 raidPtr->reconControl->lastPSID += incPSID; 738 739 if (raidPtr->reconControl->lastPSID > lastPSID) { 740 pending_writes = lastPSID - prev; 741 raidPtr->reconControl->lastPSID = lastPSID; 742 } 743 744 /* back down curPSID to get ready for the next round... */ 745 for (i = 0; i < raidPtr->numCol; i++) { 746 if (i != col) { 747 raidPtr->reconControl->perDiskInfo[i].curPSID--; 748 raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1; 749 } 750 } 751 } 752 753 mapPtr = raidPtr->reconControl->reconMap; 754 if (rf_reconDebug) { 755 printf("RECON: all reads completed\n"); 756 } 757 /* at this point all the reads have completed. We now wait 758 * for any pending writes to complete, and then we're done */ 759 760 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) { 761 762 event = rf_GetNextReconEvent(reconDesc); 763 status = ProcessReconEvent(raidPtr, event); 764 765 if (status == RF_RECON_WRITE_ERROR) { 766 recon_error = 1; 767 raidPtr->reconControl->error = 1; 768 /* an error was encountered at the very end... bail */ 769 } else { 770 #if RF_DEBUG_RECON 771 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); 772 if (rf_prReconSched) { 773 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime)); 774 } 775 #endif 776 } 777 } 778 779 if (recon_error) { 780 /* we've encountered an error in reconstructing. */ 781 printf("raid%d: reconstruction failed.\n", raidPtr->raidid); 782 783 /* we start by blocking IO to the RAID set. */ 784 rf_SuspendNewRequestsAndWait(raidPtr); 785 786 rf_lock_mutex2(raidPtr->mutex); 787 /* mark set as being degraded, rather than 788 rf_rs_reconstructing as we were before the problem. 789 After this is done we can update status of the 790 component disks without worrying about someone 791 trying to read from a failed component. 792 */ 793 raidPtr->status = rf_rs_degraded; 794 rf_unlock_mutex2(raidPtr->mutex); 795 796 /* resume IO */ 797 rf_ResumeNewRequests(raidPtr); 798 799 /* At this point there are two cases: 800 1) If we've experienced a read error, then we've 801 already waited for all the reads we're going to get, 802 and we just need to wait for the writes. 803 804 2) If we've experienced a write error, we've also 805 already waited for all the reads to complete, 806 but there is little point in waiting for the writes -- 807 when they do complete, they will just be ignored. 808 809 So we just wait for writes to complete if we didn't have a 810 write error. 811 */ 812 813 if (!write_error) { 814 /* wait for writes to complete */ 815 while (raidPtr->reconControl->pending_writes > 0) { 816 817 event = rf_GetNextReconEvent(reconDesc); 818 status = ProcessReconEvent(raidPtr, event); 819 820 if (status == RF_RECON_WRITE_ERROR) { 821 raidPtr->reconControl->error = 1; 822 /* an error was encountered at the very end... bail. 823 This will be very bad news for the user, since 824 at this point there will have been a read error 825 on one component, and a write error on another! 826 */ 827 break; 828 } 829 } 830 } 831 832 833 /* cleanup */ 834 835 /* drain the event queue - after waiting for the writes above, 836 there shouldn't be much (if anything!) left in the queue. */ 837 838 rf_DrainReconEventQueue(reconDesc); 839 840 /* XXX As much as we'd like to free the recon control structure 841 and the reconDesc, we have no way of knowing if/when those will 842 be touched by IO that has yet to occur. It is rather poor to be 843 basically causing a 'memory leak' here, but there doesn't seem to be 844 a cleaner alternative at this time. Perhaps when the reconstruct code 845 gets a makeover this problem will go away. 846 */ 847 #if 0 848 rf_FreeReconControl(raidPtr); 849 #endif 850 851 #if RF_ACC_TRACE > 0 852 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); 853 #endif 854 /* XXX see comment above */ 855 #if 0 856 FreeReconDesc(reconDesc); 857 #endif 858 859 return (1); 860 } 861 862 /* Success: mark the dead disk as reconstructed. We quiesce 863 * the array here to assure no nasty interactions with pending 864 * user accesses when we free up the psstatus structure as 865 * part of FreeReconControl() */ 866 867 rf_SuspendNewRequestsAndWait(raidPtr); 868 869 rf_lock_mutex2(raidPtr->mutex); 870 raidPtr->numFailures--; 871 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE); 872 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared; 873 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal; 874 rf_unlock_mutex2(raidPtr->mutex); 875 RF_GETTIME(etime); 876 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd); 877 878 rf_ResumeNewRequests(raidPtr); 879 880 printf("raid%d: Reconstruction of disk at col %d completed\n", 881 raidPtr->raidid, col); 882 xor_s = raidPtr->accumXorTimeUs / 1000000; 883 xor_resid_us = raidPtr->accumXorTimeUs % 1000000; 884 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n", 885 raidPtr->raidid, 886 (int) elpsd.tv_sec, (int) elpsd.tv_usec, 887 raidPtr->accumXorTimeUs, xor_s, xor_resid_us); 888 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n", 889 raidPtr->raidid, 890 (int) raidPtr->reconControl->starttime.tv_sec, 891 (int) raidPtr->reconControl->starttime.tv_usec, 892 (int) etime.tv_sec, (int) etime.tv_usec); 893 #if RF_RECON_STATS > 0 894 printf("raid%d: Total head-sep stall count was %d\n", 895 raidPtr->raidid, (int) reconDesc->hsStallCount); 896 #endif /* RF_RECON_STATS > 0 */ 897 rf_FreeReconControl(raidPtr); 898 #if RF_ACC_TRACE > 0 899 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); 900 #endif 901 FreeReconDesc(reconDesc); 902 903 return (0); 904 905 } 906 /***************************************************************************** 907 * do the right thing upon each reconstruction event. 908 *****************************************************************************/ 909 static int 910 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event) 911 { 912 int retcode = 0, submitblocked; 913 RF_ReconBuffer_t *rbuf; 914 RF_SectorCount_t sectorsPerRU; 915 916 retcode = RF_RECON_READ_STOPPED; 917 918 Dprintf1("RECON: ProcessReconEvent type %d\n", event->type); 919 920 switch (event->type) { 921 922 /* a read I/O has completed */ 923 case RF_REVENT_READDONE: 924 rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf; 925 Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n", 926 event->col, rbuf->parityStripeID); 927 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n", 928 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 929 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 930 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 931 if (!raidPtr->reconControl->error) { 932 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0); 933 Dprintf1("RECON: submitblocked=%d\n", submitblocked); 934 if (!submitblocked) 935 retcode = IssueNextReadRequest(raidPtr, event->col); 936 else 937 retcode = 0; 938 } 939 break; 940 941 /* a write I/O has completed */ 942 case RF_REVENT_WRITEDONE: 943 #if RF_DEBUG_RECON 944 if (rf_floatingRbufDebug) { 945 rf_CheckFloatingRbufCount(raidPtr, 1); 946 } 947 #endif 948 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; 949 rbuf = (RF_ReconBuffer_t *) event->arg; 950 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 951 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n", 952 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete); 953 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap, 954 rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1); 955 rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru); 956 957 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 958 raidPtr->reconControl->pending_writes--; 959 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 960 961 if (rbuf->type == RF_RBUF_TYPE_FLOATING) { 962 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 963 while(raidPtr->reconControl->rb_lock) { 964 rf_wait_cond2(raidPtr->reconControl->rb_cv, 965 raidPtr->reconControl->rb_mutex); 966 } 967 raidPtr->reconControl->rb_lock = 1; 968 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 969 970 raidPtr->numFullReconBuffers--; 971 rf_ReleaseFloatingReconBuffer(raidPtr, rbuf); 972 973 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 974 raidPtr->reconControl->rb_lock = 0; 975 rf_broadcast_cond2(raidPtr->reconControl->rb_cv); 976 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 977 } else 978 if (rbuf->type == RF_RBUF_TYPE_FORCED) 979 rf_FreeReconBuffer(rbuf); 980 else 981 RF_ASSERT(0); 982 retcode = RF_RECON_WRITE_DONE; 983 break; 984 985 case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been 986 * cleared */ 987 Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col); 988 if (!raidPtr->reconControl->error) { 989 submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf, 990 0, (int) (long) event->arg); 991 RF_ASSERT(!submitblocked); /* we wouldn't have gotten the 992 * BUFCLEAR event if we 993 * couldn't submit */ 994 retcode = IssueNextReadRequest(raidPtr, event->col); 995 } 996 break; 997 998 case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction 999 * blockage has been cleared */ 1000 DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col); 1001 if (!raidPtr->reconControl->error) { 1002 retcode = TryToRead(raidPtr, event->col); 1003 } 1004 break; 1005 1006 case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation 1007 * reconstruction blockage has been 1008 * cleared */ 1009 Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col); 1010 if (!raidPtr->reconControl->error) { 1011 retcode = TryToRead(raidPtr, event->col); 1012 } 1013 break; 1014 1015 /* a buffer has become ready to write */ 1016 case RF_REVENT_BUFREADY: 1017 Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col); 1018 if (!raidPtr->reconControl->error) { 1019 retcode = IssueNextWriteRequest(raidPtr); 1020 #if RF_DEBUG_RECON 1021 if (rf_floatingRbufDebug) { 1022 rf_CheckFloatingRbufCount(raidPtr, 1); 1023 } 1024 #endif 1025 } 1026 break; 1027 1028 /* we need to skip the current RU entirely because it got 1029 * recon'd while we were waiting for something else to happen */ 1030 case RF_REVENT_SKIP: 1031 DDprintf1("RECON: SKIP EVENT: col %d\n", event->col); 1032 if (!raidPtr->reconControl->error) { 1033 retcode = IssueNextReadRequest(raidPtr, event->col); 1034 } 1035 break; 1036 1037 /* a forced-reconstruction read access has completed. Just 1038 * submit the buffer */ 1039 case RF_REVENT_FORCEDREADDONE: 1040 rbuf = (RF_ReconBuffer_t *) event->arg; 1041 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 1042 DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col); 1043 if (!raidPtr->reconControl->error) { 1044 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0); 1045 RF_ASSERT(!submitblocked); 1046 retcode = 0; 1047 } 1048 break; 1049 1050 /* A read I/O failed to complete */ 1051 case RF_REVENT_READ_FAILED: 1052 retcode = RF_RECON_READ_ERROR; 1053 break; 1054 1055 /* A write I/O failed to complete */ 1056 case RF_REVENT_WRITE_FAILED: 1057 retcode = RF_RECON_WRITE_ERROR; 1058 1059 /* This is an error, but it was a pending write. 1060 Account for it. */ 1061 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 1062 raidPtr->reconControl->pending_writes--; 1063 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 1064 1065 rbuf = (RF_ReconBuffer_t *) event->arg; 1066 1067 /* cleanup the disk queue data */ 1068 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 1069 1070 /* At this point we're erroring out, badly, and floatingRbufs 1071 may not even be valid. Rather than putting this back onto 1072 the floatingRbufs list, just arrange for its immediate 1073 destruction. 1074 */ 1075 rf_FreeReconBuffer(rbuf); 1076 break; 1077 1078 /* a forced read I/O failed to complete */ 1079 case RF_REVENT_FORCEDREAD_FAILED: 1080 retcode = RF_RECON_READ_ERROR; 1081 break; 1082 1083 default: 1084 RF_PANIC(); 1085 } 1086 rf_FreeReconEventDesc(event); 1087 return (retcode); 1088 } 1089 /***************************************************************************** 1090 * 1091 * find the next thing that's needed on the indicated disk, and issue 1092 * a read request for it. We assume that the reconstruction buffer 1093 * associated with this process is free to receive the data. If 1094 * reconstruction is blocked on the indicated RU, we issue a 1095 * blockage-release request instead of a physical disk read request. 1096 * If the current disk gets too far ahead of the others, we issue a 1097 * head-separation wait request and return. 1098 * 1099 * ctrl->{ru_count, curPSID, diskOffset} and 1100 * rbuf->failedDiskSectorOffset are maintained to point to the unit 1101 * we're currently accessing. Note that this deviates from the 1102 * standard C idiom of having counters point to the next thing to be 1103 * accessed. This allows us to easily retry when we're blocked by 1104 * head separation or reconstruction-blockage events. 1105 * 1106 *****************************************************************************/ 1107 static int 1108 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col) 1109 { 1110 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col]; 1111 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1112 RF_ReconBuffer_t *rbuf = ctrl->rbuf; 1113 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; 1114 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1115 int do_new_check = 0, retcode = 0, status; 1116 1117 /* if we are currently the slowest disk, mark that we have to do a new 1118 * check */ 1119 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter) 1120 do_new_check = 1; 1121 1122 while (1) { 1123 1124 ctrl->ru_count++; 1125 if (ctrl->ru_count < RUsPerPU) { 1126 ctrl->diskOffset += sectorsPerRU; 1127 rbuf->failedDiskSectorOffset += sectorsPerRU; 1128 } else { 1129 ctrl->curPSID++; 1130 ctrl->ru_count = 0; 1131 /* code left over from when head-sep was based on 1132 * parity stripe id */ 1133 if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) { 1134 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter)); 1135 return (RF_RECON_DONE_READS); /* finito! */ 1136 } 1137 /* find the disk offsets of the start of the parity 1138 * stripe on both the current disk and the failed 1139 * disk. skip this entire parity stripe if either disk 1140 * does not appear in the indicated PS */ 1141 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset, 1142 &rbuf->spCol, &rbuf->spOffset); 1143 if (status) { 1144 ctrl->ru_count = RUsPerPU - 1; 1145 continue; 1146 } 1147 } 1148 rbuf->which_ru = ctrl->ru_count; 1149 1150 /* skip this RU if it's already been reconstructed */ 1151 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) { 1152 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count); 1153 continue; 1154 } 1155 break; 1156 } 1157 ctrl->headSepCounter++; 1158 if (do_new_check) 1159 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */ 1160 1161 1162 /* at this point, we have definitely decided what to do, and we have 1163 * only to see if we can actually do it now */ 1164 rbuf->parityStripeID = ctrl->curPSID; 1165 rbuf->which_ru = ctrl->ru_count; 1166 #if RF_ACC_TRACE > 0 1167 memset((char *) &raidPtr->recon_tracerecs[col], 0, 1168 sizeof(raidPtr->recon_tracerecs[col])); 1169 raidPtr->recon_tracerecs[col].reconacc = 1; 1170 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); 1171 #endif 1172 retcode = TryToRead(raidPtr, col); 1173 return (retcode); 1174 } 1175 1176 /* 1177 * tries to issue the next read on the indicated disk. We may be 1178 * blocked by (a) the heads being too far apart, or (b) recon on the 1179 * indicated RU being blocked due to a write by a user thread. In 1180 * this case, we issue a head-sep or blockage wait request, which will 1181 * cause this same routine to be invoked again later when the blockage 1182 * has cleared. 1183 */ 1184 1185 static int 1186 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col) 1187 { 1188 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col]; 1189 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; 1190 RF_StripeNum_t psid = ctrl->curPSID; 1191 RF_ReconUnitNum_t which_ru = ctrl->ru_count; 1192 RF_DiskQueueData_t *req; 1193 int status; 1194 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; 1195 1196 /* if the current disk is too far ahead of the others, issue a 1197 * head-separation wait and return */ 1198 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru)) 1199 return (0); 1200 1201 /* allocate a new PSS in case we need it */ 1202 newpssPtr = rf_AllocPSStatus(raidPtr); 1203 1204 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1205 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr); 1206 1207 if (pssPtr != newpssPtr) { 1208 rf_FreePSStatus(raidPtr, newpssPtr); 1209 } 1210 1211 /* if recon is blocked on the indicated parity stripe, issue a 1212 * block-wait request and return. this also must mark the indicated RU 1213 * in the stripe as under reconstruction if not blocked. */ 1214 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru); 1215 if (status == RF_PSS_RECON_BLOCKED) { 1216 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru); 1217 goto out; 1218 } else 1219 if (status == RF_PSS_FORCED_ON_WRITE) { 1220 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP); 1221 goto out; 1222 } 1223 /* make one last check to be sure that the indicated RU didn't get 1224 * reconstructed while we were waiting for something else to happen. 1225 * This is unfortunate in that it causes us to make this check twice 1226 * in the normal case. Might want to make some attempt to re-work 1227 * this so that we only do this check if we've definitely blocked on 1228 * one of the above checks. When this condition is detected, we may 1229 * have just created a bogus status entry, which we need to delete. */ 1230 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) { 1231 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru); 1232 if (pssPtr == newpssPtr) 1233 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1234 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP); 1235 goto out; 1236 } 1237 /* found something to read. issue the I/O */ 1238 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n", 1239 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer); 1240 #if RF_ACC_TRACE > 0 1241 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer); 1242 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer); 1243 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us = 1244 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer); 1245 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); 1246 #endif 1247 /* should be ok to use a NULL proc pointer here, all the bufs we use 1248 * should be in kernel space */ 1249 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru, 1250 ReconReadDoneProc, (void *) ctrl, 1251 #if RF_ACC_TRACE > 0 1252 &raidPtr->recon_tracerecs[col], 1253 #else 1254 NULL, 1255 #endif 1256 (void *) raidPtr, 0, NULL, PR_WAITOK); 1257 1258 ctrl->rbuf->arg = (void *) req; 1259 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY); 1260 pssPtr->issued[col] = 1; 1261 1262 out: 1263 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1264 return (0); 1265 } 1266 1267 1268 /* 1269 * given a parity stripe ID, we want to find out whether both the 1270 * current disk and the failed disk exist in that parity stripe. If 1271 * not, we want to skip this whole PS. If so, we want to find the 1272 * disk offset of the start of the PS on both the current disk and the 1273 * failed disk. 1274 * 1275 * this works by getting a list of disks comprising the indicated 1276 * parity stripe, and searching the list for the current and failed 1277 * disks. Once we've decided they both exist in the parity stripe, we 1278 * need to decide whether each is data or parity, so that we'll know 1279 * which mapping function to call to get the corresponding disk 1280 * offsets. 1281 * 1282 * this is kind of unpleasant, but doing it this way allows the 1283 * reconstruction code to use parity stripe IDs rather than physical 1284 * disks address to march through the failed disk, which greatly 1285 * simplifies a lot of code, as well as eliminating the need for a 1286 * reverse-mapping function. I also think it will execute faster, 1287 * since the calls to the mapping module are kept to a minimum. 1288 * 1289 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING 1290 * THE STRIPE IN THE CORRECT ORDER 1291 * 1292 * raidPtr - raid descriptor 1293 * psid - parity stripe identifier 1294 * col - column of disk to find the offsets for 1295 * spCol - out: col of spare unit for failed unit 1296 * spOffset - out: offset into disk containing spare unit 1297 * 1298 */ 1299 1300 1301 static int 1302 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid, 1303 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset, 1304 RF_SectorNum_t *outFailedDiskSectorOffset, 1305 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset) 1306 { 1307 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1308 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1309 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ 1310 RF_RowCol_t *diskids; 1311 u_int i, j, k, i_offset, j_offset; 1312 RF_RowCol_t pcol; 1313 int testcol; 1314 RF_SectorNum_t poffset; 1315 char i_is_parity = 0, j_is_parity = 0; 1316 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; 1317 1318 /* get a listing of the disks comprising that stripe */ 1319 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); 1320 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids); 1321 RF_ASSERT(diskids); 1322 1323 /* reject this entire parity stripe if it does not contain the 1324 * indicated disk or it does not contain the failed disk */ 1325 1326 for (i = 0; i < stripeWidth; i++) { 1327 if (col == diskids[i]) 1328 break; 1329 } 1330 if (i == stripeWidth) 1331 goto skipit; 1332 for (j = 0; j < stripeWidth; j++) { 1333 if (fcol == diskids[j]) 1334 break; 1335 } 1336 if (j == stripeWidth) { 1337 goto skipit; 1338 } 1339 /* find out which disk the parity is on */ 1340 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP); 1341 1342 /* find out if either the current RU or the failed RU is parity */ 1343 /* also, if the parity occurs in this stripe prior to the data and/or 1344 * failed col, we need to decrement i and/or j */ 1345 for (k = 0; k < stripeWidth; k++) 1346 if (diskids[k] == pcol) 1347 break; 1348 RF_ASSERT(k < stripeWidth); 1349 i_offset = i; 1350 j_offset = j; 1351 if (k < i) 1352 i_offset--; 1353 else 1354 if (k == i) { 1355 i_is_parity = 1; 1356 i_offset = 0; 1357 } /* set offsets to zero to disable multiply 1358 * below */ 1359 if (k < j) 1360 j_offset--; 1361 else 1362 if (k == j) { 1363 j_is_parity = 1; 1364 j_offset = 0; 1365 } 1366 /* at this point, [ij]_is_parity tells us whether the [current,failed] 1367 * disk is parity at the start of this RU, and, if data, "[ij]_offset" 1368 * tells us how far into the stripe the [current,failed] disk is. */ 1369 1370 /* call the mapping routine to get the offset into the current disk, 1371 * repeat for failed disk. */ 1372 if (i_is_parity) 1373 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1374 else 1375 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1376 1377 RF_ASSERT(col == testcol); 1378 1379 if (j_is_parity) 1380 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1381 else 1382 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1383 RF_ASSERT(fcol == testcol); 1384 1385 /* now locate the spare unit for the failed unit */ 1386 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1387 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { 1388 if (j_is_parity) 1389 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1390 else 1391 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1392 } else { 1393 #endif 1394 *spCol = raidPtr->reconControl->spareCol; 1395 *spOffset = *outFailedDiskSectorOffset; 1396 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1397 } 1398 #endif 1399 return (0); 1400 1401 skipit: 1402 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n", 1403 psid, col); 1404 return (1); 1405 } 1406 /* this is called when a buffer has become ready to write to the replacement disk */ 1407 static int 1408 IssueNextWriteRequest(RF_Raid_t *raidPtr) 1409 { 1410 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1411 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1412 #if RF_ACC_TRACE > 0 1413 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1414 #endif 1415 RF_ReconBuffer_t *rbuf; 1416 RF_DiskQueueData_t *req; 1417 1418 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl); 1419 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't 1420 * have gotten the event that sent us here */ 1421 RF_ASSERT(rbuf->pssPtr); 1422 1423 rbuf->pssPtr->writeRbuf = rbuf; 1424 rbuf->pssPtr = NULL; 1425 1426 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", 1427 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, 1428 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); 1429 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", 1430 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 1431 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 1432 1433 /* should be ok to use a NULL b_proc here b/c all addrs should be in 1434 * kernel space */ 1435 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, 1436 sectorsPerRU, rbuf->buffer, 1437 rbuf->parityStripeID, rbuf->which_ru, 1438 ReconWriteDoneProc, (void *) rbuf, 1439 #if RF_ACC_TRACE > 0 1440 &raidPtr->recon_tracerecs[fcol], 1441 #else 1442 NULL, 1443 #endif 1444 (void *) raidPtr, 0, NULL, PR_WAITOK); 1445 1446 rbuf->arg = (void *) req; 1447 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 1448 raidPtr->reconControl->pending_writes++; 1449 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 1450 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY); 1451 1452 return (0); 1453 } 1454 1455 /* 1456 * this gets called upon the completion of a reconstruction read 1457 * operation the arg is a pointer to the per-disk reconstruction 1458 * control structure for the process that just finished a read. 1459 * 1460 * called at interrupt context in the kernel, so don't do anything 1461 * illegal here. 1462 */ 1463 static int 1464 ReconReadDoneProc(void *arg, int status) 1465 { 1466 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; 1467 RF_Raid_t *raidPtr; 1468 1469 /* Detect that reconCtrl is no longer valid, and if that 1470 is the case, bail without calling rf_CauseReconEvent(). 1471 There won't be anyone listening for this event anyway */ 1472 1473 if (ctrl->reconCtrl == NULL) 1474 return(0); 1475 1476 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; 1477 1478 if (status) { 1479 printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status); 1480 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED); 1481 return(0); 1482 } 1483 #if RF_ACC_TRACE > 0 1484 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1485 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1486 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = 1487 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1488 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1489 #endif 1490 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE); 1491 return (0); 1492 } 1493 /* this gets called upon the completion of a reconstruction write operation. 1494 * the arg is a pointer to the rbuf that was just written 1495 * 1496 * called at interrupt context in the kernel, so don't do anything illegal here. 1497 */ 1498 static int 1499 ReconWriteDoneProc(void *arg, int status) 1500 { 1501 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; 1502 1503 /* Detect that reconControl is no longer valid, and if that 1504 is the case, bail without calling rf_CauseReconEvent(). 1505 There won't be anyone listening for this event anyway */ 1506 1507 if (rbuf->raidPtr->reconControl == NULL) 1508 return(0); 1509 1510 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru); 1511 if (status) { 1512 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid); 1513 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED); 1514 return(0); 1515 } 1516 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE); 1517 return (0); 1518 } 1519 1520 1521 /* 1522 * computes a new minimum head sep, and wakes up anyone who needs to 1523 * be woken as a result 1524 */ 1525 static void 1526 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr) 1527 { 1528 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; 1529 RF_HeadSepLimit_t new_min; 1530 RF_RowCol_t i; 1531 RF_CallbackDesc_t *p; 1532 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition 1533 * of a minimum */ 1534 1535 1536 rf_lock_mutex2(reconCtrlPtr->rb_mutex); 1537 while(reconCtrlPtr->rb_lock) { 1538 rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex); 1539 } 1540 reconCtrlPtr->rb_lock = 1; 1541 rf_unlock_mutex2(reconCtrlPtr->rb_mutex); 1542 1543 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */ 1544 for (i = 0; i < raidPtr->numCol; i++) 1545 if (i != reconCtrlPtr->fcol) { 1546 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) 1547 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; 1548 } 1549 /* set the new minimum and wake up anyone who can now run again */ 1550 if (new_min != reconCtrlPtr->minHeadSepCounter) { 1551 reconCtrlPtr->minHeadSepCounter = new_min; 1552 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min); 1553 while (reconCtrlPtr->headSepCBList) { 1554 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) 1555 break; 1556 p = reconCtrlPtr->headSepCBList; 1557 reconCtrlPtr->headSepCBList = p->next; 1558 p->next = NULL; 1559 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); 1560 rf_FreeCallbackDesc(p); 1561 } 1562 1563 } 1564 rf_lock_mutex2(reconCtrlPtr->rb_mutex); 1565 reconCtrlPtr->rb_lock = 0; 1566 rf_broadcast_cond2(reconCtrlPtr->rb_cv); 1567 rf_unlock_mutex2(reconCtrlPtr->rb_mutex); 1568 } 1569 1570 /* 1571 * checks to see that the maximum head separation will not be violated 1572 * if we initiate a reconstruction I/O on the indicated disk. 1573 * Limiting the maximum head separation between two disks eliminates 1574 * the nasty buffer-stall conditions that occur when one disk races 1575 * ahead of the others and consumes all of the floating recon buffers. 1576 * This code is complex and unpleasant but it's necessary to avoid 1577 * some very nasty, albeit fairly rare, reconstruction behavior. 1578 * 1579 * returns non-zero if and only if we have to stop working on the 1580 * indicated disk due to a head-separation delay. 1581 */ 1582 static int 1583 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl, 1584 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, 1585 RF_ReconUnitNum_t which_ru) 1586 { 1587 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; 1588 RF_CallbackDesc_t *cb, *p, *pt; 1589 int retval = 0; 1590 1591 /* if we're too far ahead of the slowest disk, stop working on this 1592 * disk until the slower ones catch up. We do this by scheduling a 1593 * wakeup callback for the time when the slowest disk has caught up. 1594 * We define "caught up" with 20% hysteresis, i.e. the head separation 1595 * must have fallen to at most 80% of the max allowable head 1596 * separation before we'll wake up. 1597 * 1598 */ 1599 rf_lock_mutex2(reconCtrlPtr->rb_mutex); 1600 while(reconCtrlPtr->rb_lock) { 1601 rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex); 1602 } 1603 reconCtrlPtr->rb_lock = 1; 1604 rf_unlock_mutex2(reconCtrlPtr->rb_mutex); 1605 if ((raidPtr->headSepLimit >= 0) && 1606 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) { 1607 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n", 1608 raidPtr->raidid, col, ctrl->headSepCounter, 1609 reconCtrlPtr->minHeadSepCounter, 1610 raidPtr->headSepLimit); 1611 cb = rf_AllocCallbackDesc(); 1612 /* the minHeadSepCounter value we have to get to before we'll 1613 * wake up. build in 20% hysteresis. */ 1614 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5); 1615 cb->col = col; 1616 cb->next = NULL; 1617 1618 /* insert this callback descriptor into the sorted list of 1619 * pending head-sep callbacks */ 1620 p = reconCtrlPtr->headSepCBList; 1621 if (!p) 1622 reconCtrlPtr->headSepCBList = cb; 1623 else 1624 if (cb->callbackArg.v < p->callbackArg.v) { 1625 cb->next = reconCtrlPtr->headSepCBList; 1626 reconCtrlPtr->headSepCBList = cb; 1627 } else { 1628 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next); 1629 cb->next = p; 1630 pt->next = cb; 1631 } 1632 retval = 1; 1633 #if RF_RECON_STATS > 0 1634 ctrl->reconCtrl->reconDesc->hsStallCount++; 1635 #endif /* RF_RECON_STATS > 0 */ 1636 } 1637 rf_lock_mutex2(reconCtrlPtr->rb_mutex); 1638 reconCtrlPtr->rb_lock = 0; 1639 rf_broadcast_cond2(reconCtrlPtr->rb_cv); 1640 rf_unlock_mutex2(reconCtrlPtr->rb_mutex); 1641 1642 return (retval); 1643 } 1644 /* 1645 * checks to see if reconstruction has been either forced or blocked 1646 * by a user operation. if forced, we skip this RU entirely. else if 1647 * blocked, put ourselves on the wait list. else return 0. 1648 * 1649 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY 1650 */ 1651 static int 1652 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, 1653 RF_ReconParityStripeStatus_t *pssPtr, 1654 RF_PerDiskReconCtrl_t *ctrl, 1655 RF_RowCol_t col, 1656 RF_StripeNum_t psid, 1657 RF_ReconUnitNum_t which_ru) 1658 { 1659 RF_CallbackDesc_t *cb; 1660 int retcode = 0; 1661 1662 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) 1663 retcode = RF_PSS_FORCED_ON_WRITE; 1664 else 1665 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { 1666 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru); 1667 cb = rf_AllocCallbackDesc(); /* append ourselves to 1668 * the blockage-wait 1669 * list */ 1670 cb->col = col; 1671 cb->next = pssPtr->blockWaitList; 1672 pssPtr->blockWaitList = cb; 1673 retcode = RF_PSS_RECON_BLOCKED; 1674 } 1675 if (!retcode) 1676 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under 1677 * reconstruction */ 1678 1679 return (retcode); 1680 } 1681 /* 1682 * if reconstruction is currently ongoing for the indicated stripeID, 1683 * reconstruction is forced to completion and we return non-zero to 1684 * indicate that the caller must wait. If not, then reconstruction is 1685 * blocked on the indicated stripe and the routine returns zero. If 1686 * and only if we return non-zero, we'll cause the cbFunc to get 1687 * invoked with the cbArg when the reconstruction has completed. 1688 */ 1689 int 1690 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 1691 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg) 1692 { 1693 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're 1694 * forcing recon on */ 1695 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ 1696 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity 1697 * stripe status structure */ 1698 RF_StripeNum_t psid; /* parity stripe id */ 1699 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk 1700 * offset */ 1701 RF_RowCol_t *diskids; 1702 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ 1703 RF_RowCol_t fcol, diskno, i; 1704 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ 1705 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */ 1706 RF_CallbackDesc_t *cb; 1707 int nPromoted; 1708 1709 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1710 1711 /* allocate a new PSS in case we need it */ 1712 newpssPtr = rf_AllocPSStatus(raidPtr); 1713 1714 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1715 1716 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr); 1717 1718 if (pssPtr != newpssPtr) { 1719 rf_FreePSStatus(raidPtr, newpssPtr); 1720 } 1721 1722 /* if recon is not ongoing on this PS, just return */ 1723 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1724 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1725 return (0); 1726 } 1727 /* otherwise, we have to wait for reconstruction to complete on this 1728 * RU. */ 1729 /* In order to avoid waiting for a potentially large number of 1730 * low-priority accesses to complete, we force a normal-priority (i.e. 1731 * not low-priority) reconstruction on this RU. */ 1732 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { 1733 DDprintf1("Forcing recon on psid %ld\n", psid); 1734 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under 1735 * forced recon */ 1736 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage 1737 * that we just set */ 1738 fcol = raidPtr->reconControl->fcol; 1739 1740 /* get a listing of the disks comprising the indicated stripe */ 1741 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids); 1742 1743 /* For previously issued reads, elevate them to normal 1744 * priority. If the I/O has already completed, it won't be 1745 * found in the queue, and hence this will be a no-op. For 1746 * unissued reads, allocate buffers and issue new reads. The 1747 * fact that we've set the FORCED bit means that the regular 1748 * recon procs will not re-issue these reqs */ 1749 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++) 1750 if ((diskno = diskids[i]) != fcol) { 1751 if (pssPtr->issued[diskno]) { 1752 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru); 1753 if (rf_reconDebug && nPromoted) 1754 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno); 1755 } else { 1756 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ 1757 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset, 1758 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare 1759 * location */ 1760 new_rbuf->parityStripeID = psid; /* fill in the buffer */ 1761 new_rbuf->which_ru = which_ru; 1762 new_rbuf->failedDiskSectorOffset = fd_offset; 1763 new_rbuf->priority = RF_IO_NORMAL_PRIORITY; 1764 1765 /* use NULL b_proc b/c all addrs 1766 * should be in kernel space */ 1767 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, 1768 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf, 1769 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK); 1770 1771 new_rbuf->arg = req; 1772 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ 1773 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno); 1774 } 1775 } 1776 /* if the write is sitting in the disk queue, elevate its 1777 * priority */ 1778 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru)) 1779 if (rf_reconDebug) 1780 printf("raid%d: promoted write to col %d\n", 1781 raidPtr->raidid, fcol); 1782 } 1783 /* install a callback descriptor to be invoked when recon completes on 1784 * this parity stripe. */ 1785 cb = rf_AllocCallbackDesc(); 1786 /* XXX the following is bogus.. These functions don't really match!! 1787 * GO */ 1788 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc; 1789 cb->callbackArg.p = (void *) cbArg; 1790 cb->next = pssPtr->procWaitList; 1791 pssPtr->procWaitList = cb; 1792 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n", 1793 raidPtr->raidid, psid); 1794 1795 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1796 return (1); 1797 } 1798 /* called upon the completion of a forced reconstruction read. 1799 * all we do is schedule the FORCEDREADONE event. 1800 * called at interrupt context in the kernel, so don't do anything illegal here. 1801 */ 1802 static void 1803 ForceReconReadDoneProc(void *arg, int status) 1804 { 1805 RF_ReconBuffer_t *rbuf = arg; 1806 1807 /* Detect that reconControl is no longer valid, and if that 1808 is the case, bail without calling rf_CauseReconEvent(). 1809 There won't be anyone listening for this event anyway */ 1810 1811 if (rbuf->raidPtr->reconControl == NULL) 1812 return; 1813 1814 if (status) { 1815 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid); 1816 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED); 1817 return; 1818 } 1819 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); 1820 } 1821 /* releases a block on the reconstruction of the indicated stripe */ 1822 int 1823 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) 1824 { 1825 RF_StripeNum_t stripeID = asmap->stripeID; 1826 RF_ReconParityStripeStatus_t *pssPtr; 1827 RF_ReconUnitNum_t which_ru; 1828 RF_StripeNum_t psid; 1829 RF_CallbackDesc_t *cb; 1830 1831 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1832 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1833 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL); 1834 1835 /* When recon is forced, the pss desc can get deleted before we get 1836 * back to unblock recon. But, this can _only_ happen when recon is 1837 * forced. It would be good to put some kind of sanity check here, but 1838 * how to decide if recon was just forced or not? */ 1839 if (!pssPtr) { 1840 /* printf("Warning: no pss descriptor upon unblock on psid %ld 1841 * RU %d\n",psid,which_ru); */ 1842 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0) 1843 if (rf_reconDebug || rf_pssDebug) 1844 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru); 1845 #endif 1846 goto out; 1847 } 1848 pssPtr->blockCount--; 1849 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n", 1850 raidPtr->raidid, psid, pssPtr->blockCount); 1851 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ 1852 1853 /* unblock recon before calling CauseReconEvent in case 1854 * CauseReconEvent causes us to try to issue a new read before 1855 * returning here. */ 1856 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; 1857 1858 1859 while (pssPtr->blockWaitList) { 1860 /* spin through the block-wait list and 1861 release all the waiters */ 1862 cb = pssPtr->blockWaitList; 1863 pssPtr->blockWaitList = cb->next; 1864 cb->next = NULL; 1865 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR); 1866 rf_FreeCallbackDesc(cb); 1867 } 1868 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1869 /* if no recon was requested while recon was blocked */ 1870 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1871 } 1872 } 1873 out: 1874 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1875 return (0); 1876 } 1877 1878 void 1879 rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr) 1880 { 1881 RF_CallbackDesc_t *p; 1882 1883 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 1884 while(raidPtr->reconControl->rb_lock) { 1885 rf_wait_cond2(raidPtr->reconControl->rb_cv, 1886 raidPtr->reconControl->rb_mutex); 1887 } 1888 1889 raidPtr->reconControl->rb_lock = 1; 1890 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 1891 1892 while (raidPtr->reconControl->headSepCBList) { 1893 p = raidPtr->reconControl->headSepCBList; 1894 raidPtr->reconControl->headSepCBList = p->next; 1895 p->next = NULL; 1896 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); 1897 rf_FreeCallbackDesc(p); 1898 } 1899 rf_lock_mutex2(raidPtr->reconControl->rb_mutex); 1900 raidPtr->reconControl->rb_lock = 0; 1901 rf_broadcast_cond2(raidPtr->reconControl->rb_cv); 1902 rf_unlock_mutex2(raidPtr->reconControl->rb_mutex); 1903 1904 } 1905 1906