/*	$NetBSD: rf_reconstruct.c,v 1.120 2014/06/14 07:39:00 hannken Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/************************************************************
 *
 * rf_reconstruct.c -- code to perform on-line reconstruction
 *
 ************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.120 2014/06/14 07:39:00 hannken Exp $");

#include <sys/param.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/namei.h> /* for pathbuf */
#include <dev/raidframe/raidframevar.h>

#include <miscfs/specfs/specdev.h> /* for v_rdev */

#include "rf_raid.h"
#include "rf_reconutil.h"
#include "rf_revent.h"
#include "rf_reconbuffer.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_dag.h"
#include "rf_desc.h"
#include "rf_debugprint.h"
#include "rf_general.h"
#include "rf_driver.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

#include "rf_kintf.h"

/* setting these to -1 causes them to be set to their default values if not set by debug options */

#if RF_DEBUG_RECON
#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)

#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)

#else /* RF_DEBUG_RECON */

#define Dprintf(s) {}
#define Dprintf1(s,a) {}
#define Dprintf2(s,a,b) {}
#define Dprintf3(s,a,b,c) {}
#define Dprintf4(s,a,b,c,d) {}
#define Dprintf5(s,a,b,c,d,e) {}
#define Dprintf6(s,a,b,c,d,e,f) {}
#define Dprintf7(s,a,b,c,d,e,f,g) {}

#define DDprintf1(s,a) {}
#define DDprintf2(s,a,b) {}

#endif /* RF_DEBUG_RECON */

#define RF_RECON_DONE_READS   1
#define RF_RECON_READ_ERROR   2
#define RF_RECON_WRITE_ERROR  3
#define RF_RECON_READ_STOPPED 4
#define RF_RECON_WRITE_DONE   5

#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16
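/*
 * Note (added for clarity): the RF_RECON_* values above are the status
 * codes returned by ProcessReconEvent() below; the main loop in
 * rf_ContinueReconstructFailedDisk() uses them to track read completion,
 * write completion, and error state.  RF_MIN/MAX_FREE_RECONBUFFER bound
 * the reconstruction buffer pool created in rf_ConfigureReconstruction().
 */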
static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
					      RF_RaidDisk_t *, int, RF_RowCol_t);
static void FreeReconDesc(RF_RaidReconDesc_t *);
static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
static int TryToRead(RF_Raid_t *, RF_RowCol_t);
static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
				RF_SectorNum_t *);
static int IssueNextWriteRequest(RF_Raid_t *);
static int ReconReadDoneProc(void *, int);
static int ReconWriteDoneProc(void *, int);
static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
			       RF_RowCol_t, RF_HeadSepLimit_t,
			       RF_ReconUnitNum_t);
static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
					      RF_ReconParityStripeStatus_t *,
					      RF_PerDiskReconCtrl_t *,
					      RF_RowCol_t, RF_StripeNum_t,
					      RF_ReconUnitNum_t);
static void ForceReconReadDoneProc(void *, int);
static void rf_ShutdownReconstruction(void *);

struct RF_ReconDoneProc_s {
	void    (*proc) (RF_Raid_t *, void *);
	void   *arg;
	RF_ReconDoneProc_t *next;
};
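/*
 * Note (added for clarity): RF_ReconDoneProc_s is a node in a simple
 * singly-linked list pairing a reconstruction-completion callback with
 * its argument; the next field chains the registrations together.
 */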
/**************************************************************************
 *
 * sets up the parameters that will be used by the reconstruction process
 * currently there are none, except for those that the layout-specific
 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
 *
 * in the kernel, we fire off the recon thread.
 *
 **************************************************************************/
static void
rf_ShutdownReconstruction(void *ignored)
{
	pool_destroy(&rf_pools.reconbuffer);
}

int
rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);

	return (0);
}

static RF_RaidReconDesc_t *
AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
		   RF_RowCol_t scol)
{

	RF_RaidReconDesc_t *reconDesc;

	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
		  (RF_RaidReconDesc_t *));
	reconDesc->raidPtr = raidPtr;
	reconDesc->col = col;
	reconDesc->spareDiskPtr = spareDiskPtr;
	reconDesc->numDisksDone = numDisksDone;
	reconDesc->scol = scol;
	reconDesc->next = NULL;

	return (reconDesc);
}

static void
FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	printf("raid%d: %lu recon event waits, %lu recon delays\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->numReconEventWaits,
	       (long) reconDesc->numReconExecDelays);
#endif				/* RF_RECON_STATS > 0 */
	printf("raid%d: %lu max exec ticks\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->maxReconExecTicks);
	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
}


/*****************************************************************************
 *
 * primary routine to reconstruct a failed disk.  This should be called from
 * within its own thread.  It won't return until reconstruction completes,
 * fails, or is aborted.
 *****************************************************************************/
int
rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	const RF_LayoutSW_t *lp;
	int     rc;

	lp = raidPtr->Layout.map;
	if (lp->SubmitReconBuffer) {
		/*
		 * The current infrastructure only supports reconstructing one
		 * disk at a time for each array.
		 */
		rf_lock_mutex2(raidPtr->mutex);
		while (raidPtr->reconInProgress) {
			rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
		}
		raidPtr->reconInProgress++;
		rf_unlock_mutex2(raidPtr->mutex);
		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
	} else {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		rc = EIO;
		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_signal_cond2(raidPtr->waitForReconCond);
	rf_unlock_mutex2(raidPtr->mutex);
	return (rc);
}
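/*
 * Note (added for clarity): spare disks occupy the columns numbered
 * numCol .. numCol + numSpare - 1, which is why the spare search below
 * starts scanning at raidPtr->numCol.
 */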
int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t scol;
	int     numDisksDone = 0, rc;

	/* first look for a spare drive onto which to reconstruct the data */
	/* spare disk descriptors are stored in row 0.  This may have to
	 * change eventually */

	rf_lock_mutex2(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		if (raidPtr->status != rf_rs_degraded) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		scol = (-1);
	} else {
#endif
		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[scol];
				spareDiskPtr->status = rf_ds_used_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
			rf_unlock_mutex2(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	rf_unlock_mutex2(raidPtr->mutex);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, scol);

		raid_init_component_label(raidPtr, c_label);
		c_label->row = 0;
		c_label->column = col;
		c_label->clean = RF_RAID_DIRTY;
		c_label->status = rf_ds_optimal;
		rf_component_label_set_partitionsize(c_label,
		    raidPtr->Disks[scol].partitionSize);

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->parity_good = RF_RAID_CLEAN;
		rf_unlock_mutex2(raidPtr->mutex);

		/* XXXX MORE NEEDED HERE */

		raidflush_component_label(raidPtr, scol);
	} else {
		/* Reconstruct failed. */

		rf_lock_mutex2(raidPtr->mutex);
		/* Failed disk goes back to "failed" status */
		raidPtr->Disks[col].status = rf_ds_failed;

		/* Spare disk goes back to "spare" status. */
		spareDiskPtr->status = rf_ds_spare;
		rf_unlock_mutex2(raidPtr->mutex);

	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	return (rc);
}

/*

   Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
   and you don't get a spare until the next Monday.  With this function
   (and hot-swappable drives) you can now put your new disk containing
   /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
   rebuild the data "on the spot".

*/
int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	const RF_LayoutSW_t *lp;
	RF_ComponentLabel_t *c_label;
	int     numDisksDone = 0, rc;
	uint64_t numsec;
	unsigned int secsize;
	struct pathbuf *pb;
	struct vnode *vp;
	int     retcode;
	int     ac;

	rf_lock_mutex2(raidPtr->mutex);
	lp = raidPtr->Layout.map;
	if (!lp->SubmitReconBuffer) {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		/* wakeup anyone who might be waiting to do a reconstruct */
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(EIO);
	}

	/*
	 * The current infrastructure only supports reconstructing one
	 * disk at a time for each array.
	 */

	if (raidPtr->Disks[col].status != rf_ds_failed) {
		/* "It's gone..." */
		raidPtr->numFailures++;
		raidPtr->Disks[col].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
	}

	while (raidPtr->reconInProgress) {
		rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
	}

	raidPtr->reconInProgress++;

	/* first look for a spare drive onto which to reconstruct the
	   data.  spare disk descriptors are stored in row 0.  This
	   may have to change eventually */

	/* Actually, we don't care if it's failed or not...  On a RAID
	   set with correct parity, this function should be callable
	   on any component without ill effects. */
	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */

#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);

		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return (EINVAL);
	}
#endif

	/* This device may have been opened successfully the
	   first time. Close it before trying to open it again.. */

	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
#if 0
		printf("Closed the open device: %s\n",
		       raidPtr->Disks[col].devname);
#endif
		vp = raidPtr->raid_cinfo[col].ci_vp;
		ac = raidPtr->Disks[col].auto_configured;
		rf_unlock_mutex2(raidPtr->mutex);
		rf_close_component(raidPtr, vp, ac);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->raid_cinfo[col].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer)*/
	raidPtr->Disks[col].auto_configured = 0;

#if 0
	printf("About to (re-)open the device for rebuilding: %s\n",
	       raidPtr->Disks[col].devname);
#endif
	rf_unlock_mutex2(raidPtr->mutex);
	pb = pathbuf_create(raidPtr->Disks[col].devname);
	if (pb == NULL) {
		retcode = ENOMEM;
	} else {
		retcode = dk_lookup(pb, curlwp, &vp);
		pathbuf_destroy(pb);
	}

	if (retcode) {
		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n", raidPtr->raidid,
		       raidPtr->Disks[col].devname, retcode);

		/* the component isn't responding properly...
		   must be still dead :-( */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(retcode);
	}
	/* Ok, so we can at least do a lookup...
	   How about actually getting a vp for it? */

	retcode = getdisksize(vp, &numsec, &secsize);
	if (retcode) {
		vn_close(vp, FREAD | FWRITE, kauth_cred_get());
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(retcode);
	}
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->Disks[col].blockSize = secsize;
	raidPtr->Disks[col].numBlocks = numsec - rf_protectedSectors;

	raidPtr->raid_cinfo[col].ci_vp = vp;
	raidPtr->raid_cinfo[col].ci_dev = vp->v_rdev;

	raidPtr->Disks[col].dev = vp->v_rdev;

	/* we allow the user to specify that only a fraction of the
	   disks should be used.  this is just for debug: it speeds up
	   the parity scan */
	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
		rf_sizePercentage / 100;
	rf_unlock_mutex2(raidPtr->mutex);

	spareDiskPtr = &raidPtr->Disks[col];
	spareDiskPtr->status = rf_ds_used_spare;

	printf("raid%d: initiating in-place reconstruction on column %d\n",
	       raidPtr->raidid, col);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
				       numDisksDone, col);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		rf_lock_mutex2(raidPtr->mutex);
		/* Need to set these here, as at this point it'll be claiming
		   that the disk is in rf_ds_spared!  But we know better :-) */

		raidPtr->Disks[col].status = rf_ds_optimal;
		raidPtr->status = rf_rs_optimal;
		rf_unlock_mutex2(raidPtr->mutex);

		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, col);

		rf_lock_mutex2(raidPtr->mutex);
		raid_init_component_label(raidPtr, c_label);

		c_label->row = 0;
		c_label->column = col;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		raidPtr->parity_good = RF_RAID_CLEAN;
		rf_unlock_mutex2(raidPtr->mutex);

		raidflush_component_label(raidPtr, col);
	} else {
		/* Reconstruct-in-place failed.  Disk goes back to
		   "failed" status, regardless of what it was before. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->Disks[col].status = rf_ds_failed;
		rf_unlock_mutex2(raidPtr->mutex);
	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->reconInProgress--;
	rf_signal_cond2(raidPtr->waitForReconCond);
	rf_unlock_mutex2(raidPtr->mutex);

	return (rc);
}


int
rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_RowCol_t col = reconDesc->col;
	RF_RowCol_t scol = reconDesc->scol;
	RF_ReconMap_t *mapPtr;
	RF_ReconCtrl_t *tmp_reconctrl;
	RF_ReconEvent_t *event;
	RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
#if RF_INCLUDE_RAID5_RS > 0
	RF_StripeCount_t startPSID,endPSID,aPSID,bPSID,offPSID;
#endif
	RF_ReconUnitCount_t RUsPerPU;
	struct timeval etime, elpsd;
	unsigned long xor_s, xor_resid_us;
	int     i, ds;
	int     status, done;
	int     recon_error, write_error;

	raidPtr->accumXorTimeUs = 0;
#if RF_ACC_TRACE > 0
	/* create one trace record per physical disk */
	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
#endif

	/* quiesce the array prior to starting recon.  this is needed
	 * to assure no nasty interactions with pending user writes.
	 * We need to do this before we change the disk or row status. */

	Dprintf("RECON: begin request suspend\n");
	rf_SuspendNewRequestsAndWait(raidPtr);
	Dprintf("RECON: end request suspend\n");

	/* allocate our RF_ReconCtrl_t before we protect raidPtr->reconControl[row] */
	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);

	rf_lock_mutex2(raidPtr->mutex);

	/* create the reconstruction control pointer and install it in
	 * the right slot */
	raidPtr->reconControl = tmp_reconctrl;
	mapPtr = raidPtr->reconControl->reconMap;
	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
	raidPtr->reconControl->numRUsComplete = 0;
	raidPtr->status = rf_rs_reconstructing;
	raidPtr->Disks[col].status = rf_ds_reconstructing;
	raidPtr->Disks[col].spareCol = scol;

	rf_unlock_mutex2(raidPtr->mutex);

	RF_GETTIME(raidPtr->reconControl->starttime);

	Dprintf("RECON: resume requests\n");
	rf_ResumeNewRequests(raidPtr);


	mapPtr = raidPtr->reconControl->reconMap;

	incPSID = RF_RECONMAP_SIZE;
	lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
	RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
	recon_error = 0;
	write_error = 0;
	pending_writes = incPSID;
	raidPtr->reconControl->lastPSID = incPSID - 1;

	/* bounds check raidPtr->reconControl->lastPSID and
	   pending_writes so that we don't attempt to wait for more IO
	   than can possibly happen */

	if (raidPtr->reconControl->lastPSID > lastPSID)
		raidPtr->reconControl->lastPSID = lastPSID;

	if (pending_writes > lastPSID)
		pending_writes = lastPSID;
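	/*
	 * Note (added for clarity): reconstruction proceeds in windows
	 * of incPSID (== RF_RECONMAP_SIZE) parity stripes.  Each pass
	 * of the loop below reads every surviving disk across the
	 * current window, waits for that window's writes to drain, then
	 * advances reconControl->lastPSID by incPSID until it reaches
	 * lastPSID.  For illustration: with 1000 parity stripes and a
	 * window of 32, the loop makes about 32 passes, the last one
	 * covering the few leftover stripes.
	 */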
	/* start the actual reconstruction */

	done = 0;
	while (!done) {

		if (raidPtr->waitShutdown) {
			/* someone is unconfiguring this array... bail on the reconstruct.. */
			recon_error = 1;
			break;
		}

		num_writes = 0;

#if RF_INCLUDE_RAID5_RS > 0
		/* For RAID5 with Rotated Spares we will be 'short'
		   some number of writes since no writes will get
		   issued for stripes where the spare is on the
		   component being rebuilt.  Account for the shortage
		   here so that we don't hang indefinitely below
		   waiting for writes to complete that were never
		   scheduled.

		   XXX: Should be fixed for PARITY_DECLUSTERING and
		   others too!

		*/

		if (raidPtr->Layout.numDataCol <
		    raidPtr->numCol - raidPtr->Layout.numParityCol) {
			/* numDataCol is at least 2 less than numCol, so
			   should be RAID 5 with Rotated Spares */

			/* XXX need to update for RAID 6 */

			startPSID = raidPtr->reconControl->lastPSID - pending_writes + 1;
			endPSID = raidPtr->reconControl->lastPSID;

			offPSID = raidPtr->numCol - col - 1;

			aPSID = startPSID - startPSID % raidPtr->numCol + offPSID;
			if (aPSID < startPSID) {
				aPSID += raidPtr->numCol;
			}

			bPSID = endPSID - ((endPSID - offPSID) % raidPtr->numCol);

			if (aPSID < endPSID) {
				num_writes = ((bPSID - aPSID) / raidPtr->numCol) + 1;
			}

			if ((aPSID == endPSID) && (bPSID == endPSID)) {
				num_writes++;
			}
		}
#endif
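		/*
		 * Worked example (added for clarity) of the RAID5-RS
		 * accounting above, assuming numCol = 5 and col = 1, so
		 * offPSID = 3: for the window startPSID = 10 .. endPSID = 19,
		 * aPSID = 10 - 0 + 3 = 13 and bPSID = 19 - ((19 - 3) % 5) = 18,
		 * giving num_writes = (18 - 13) / 5 + 1 = 2.  Parity stripes
		 * 13 and 18 are the ones whose spare unit falls on the disk
		 * being rebuilt, so no write will ever be issued for them.
		 */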
		/* issue a read for each surviving disk */

		reconDesc->numDisksDone = 0;
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				/* find and issue the next I/O on the
				 * indicated disk */
				if (IssueNextReadRequest(raidPtr, i)) {
					Dprintf1("RECON: done issuing for c%d\n", i);
					reconDesc->numDisksDone++;
				}
			}
		}

		/* process reconstruction events until all disks report that
		 * they've completed all work */

		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {

			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			/* the normal case is that a read completes, and all is well. */
			if (status == RF_RECON_DONE_READS) {
				reconDesc->numDisksDone++;
			} else if ((status == RF_RECON_READ_ERROR) ||
				   (status == RF_RECON_WRITE_ERROR)) {
				/* an error was encountered while reconstructing...
				   Pretend we've finished this disk.
				*/
				recon_error = 1;
				raidPtr->reconControl->error = 1;

				/* bump the numDisksDone count for reads,
				   but not for writes */
				if (status == RF_RECON_READ_ERROR)
					reconDesc->numDisksDone++;

				/* write errors are special -- when we are
				   done dealing with the reads that are
				   finished, we don't want to wait for any
				   writes */
				if (status == RF_RECON_WRITE_ERROR) {
					write_error = 1;
					num_writes++;
				}

			} else if (status == RF_RECON_READ_STOPPED) {
				/* count this component as being "done" */
				reconDesc->numDisksDone++;
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			}

			if (recon_error) {
				/* make sure any stragglers are woken up so that
				   their threads will complete, and we can get out
				   of here with all IO processed */

				rf_WakeupHeadSepCBWaiters(raidPtr);
			}

			raidPtr->reconControl->numRUsTotal =
				mapPtr->totalRUs;
			raidPtr->reconControl->numRUsComplete =
				mapPtr->totalRUs -
				rf_UnitsLeftToReconstruct(mapPtr);

#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete =
				(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}

		/* reads done, wakeup any waiters, and then wait for writes */

		rf_WakeupHeadSepCBWaiters(raidPtr);

		while (!recon_error && (num_writes < pending_writes)) {
			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			if (status == RF_RECON_WRITE_ERROR) {
				num_writes++;
				recon_error = 1;
				raidPtr->reconControl->error = 1;
				/* an error was encountered at the very end... bail */
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			} /* else it's something else, and we don't care */
		}
		if (recon_error ||
		    (raidPtr->reconControl->lastPSID == lastPSID)) {
			done = 1;
			break;
		}

		prev = raidPtr->reconControl->lastPSID;
		raidPtr->reconControl->lastPSID += incPSID;

		if (raidPtr->reconControl->lastPSID > lastPSID) {
			pending_writes = lastPSID - prev;
			raidPtr->reconControl->lastPSID = lastPSID;
		}

		/* back down curPSID to get ready for the next round... */
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				raidPtr->reconControl->perDiskInfo[i].curPSID--;
				raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
			}
		}
	}

	mapPtr = raidPtr->reconControl->reconMap;
	if (rf_reconDebug) {
		printf("RECON: all reads completed\n");
	}
	/* at this point all the reads have completed.  We now wait
	 * for any pending writes to complete, and then we're done */

	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {

		event = rf_GetNextReconEvent(reconDesc);
		status = ProcessReconEvent(raidPtr, event);

		if (status == RF_RECON_WRITE_ERROR) {
			recon_error = 1;
			raidPtr->reconControl->error = 1;
			/* an error was encountered at the very end... bail */
		} else {
#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}
	}
	if (recon_error) {
		/* we've encountered an error in reconstructing. */
		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);

		/* we start by blocking IO to the RAID set. */
		rf_SuspendNewRequestsAndWait(raidPtr);

		rf_lock_mutex2(raidPtr->mutex);
		/* mark set as being degraded, rather than
		   rf_rs_reconstructing as we were before the problem.
		   After this is done we can update status of the
		   component disks without worrying about someone
		   trying to read from a failed component.
		*/
		raidPtr->status = rf_rs_degraded;
		rf_unlock_mutex2(raidPtr->mutex);

		/* resume IO */
		rf_ResumeNewRequests(raidPtr);

		/* At this point there are two cases:
		   1) If we've experienced a read error, then we've
		   already waited for all the reads we're going to get,
		   and we just need to wait for the writes.

		   2) If we've experienced a write error, we've also
		   already waited for all the reads to complete,
		   but there is little point in waiting for the writes --
		   when they do complete, they will just be ignored.

		   So we just wait for writes to complete if we didn't have a
		   write error.
		*/

		if (!write_error) {
			/* wait for writes to complete */
			while (raidPtr->reconControl->pending_writes > 0) {

				event = rf_GetNextReconEvent(reconDesc);
				status = ProcessReconEvent(raidPtr, event);

				if (status == RF_RECON_WRITE_ERROR) {
					raidPtr->reconControl->error = 1;
					/* an error was encountered at the very end... bail.
					   This will be very bad news for the user, since
					   at this point there will have been a read error
					   on one component, and a write error on another!
					*/
					break;
				}
			}
		}


		/* cleanup */

		/* drain the event queue - after waiting for the writes above,
		   there shouldn't be much (if anything!) left in the queue. */

		rf_DrainReconEventQueue(reconDesc);

		/* XXX As much as we'd like to free the recon control structure
		   and the reconDesc, we have no way of knowing if/when those will
		   be touched by IO that has yet to occur.  It is rather poor to be
		   basically causing a 'memory leak' here, but there doesn't seem to be
		   a cleaner alternative at this time.  Perhaps when the reconstruct code
		   gets a makeover this problem will go away.
		*/
#if 0
		rf_FreeReconControl(raidPtr);
#endif

#if RF_ACC_TRACE > 0
		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
		/* XXX see comment above */
#if 0
		FreeReconDesc(reconDesc);
#endif

		return (1);
	}

	/* Success:  mark the dead disk as reconstructed.  We quiesce
	 * the array here to assure no nasty interactions with pending
	 * user accesses when we free up the psstatus structure as
	 * part of FreeReconControl() */

	rf_SuspendNewRequestsAndWait(raidPtr);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->numFailures--;
	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
	rf_unlock_mutex2(raidPtr->mutex);
	RF_GETTIME(etime);
	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);

	rf_ResumeNewRequests(raidPtr);
	printf("raid%d: Reconstruction of disk at col %d completed\n",
	       raidPtr->raidid, col);
	xor_s = raidPtr->accumXorTimeUs / 1000000;
	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
	       raidPtr->raidid,
	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
	printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
	       raidPtr->raidid,
	       (int) raidPtr->reconControl->starttime.tv_sec,
	       (int) raidPtr->reconControl->starttime.tv_usec,
	       (int) etime.tv_sec, (int) etime.tv_usec);
#if RF_RECON_STATS > 0
	printf("raid%d: Total head-sep stall count was %d\n",
	       raidPtr->raidid, (int) reconDesc->hsStallCount);
#endif				/* RF_RECON_STATS > 0 */
	rf_FreeReconControl(raidPtr);
#if RF_ACC_TRACE > 0
	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
	FreeReconDesc(reconDesc);

	return (0);

}
/*****************************************************************************
 * do the right thing upon each reconstruction event.
 *****************************************************************************/
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
{
	int     retcode = 0, submitblocked;
	RF_ReconBuffer_t *rbuf;
	RF_SectorCount_t sectorsPerRU;

	retcode = RF_RECON_READ_STOPPED;

	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);

	switch (event->type) {

		/* a read I/O has completed */
	case RF_REVENT_READDONE:
		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
			 event->col, rbuf->parityStripeID);
		Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
			 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
			 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
			if (!submitblocked)
				retcode = IssueNextReadRequest(raidPtr, event->col);
			else
				retcode = 0;
		}
		break;

		/* a write I/O has completed */
	case RF_REVENT_WRITEDONE:
#if RF_DEBUG_RECON
		if (rf_floatingRbufDebug) {
			rf_CheckFloatingRbufCount(raidPtr, 1);
		}
#endif
		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
			 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
				  rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);

		rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

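		/*
		 * Note (added for clarity): rb_lock below is a sleepable
		 * lock built from rb_mutex and rb_cv; it serializes the
		 * floating recon buffer bookkeeping while still letting
		 * the holder block.
		 */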
		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
			rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
			while(raidPtr->reconControl->rb_lock) {
				rf_wait_cond2(raidPtr->reconControl->rb_cv,
					      raidPtr->reconControl->rb_mutex);
			}
			raidPtr->reconControl->rb_lock = 1;
			rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

			raidPtr->numFullReconBuffers--;
			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);

			rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
			raidPtr->reconControl->rb_lock = 0;
			rf_broadcast_cond2(raidPtr->reconControl->rb_cv);
			rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
		} else
			if (rbuf->type == RF_RBUF_TYPE_FORCED)
				rf_FreeReconBuffer(rbuf);
			else
				RF_ASSERT(0);
		retcode = RF_RECON_WRITE_DONE;
		break;

	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
					 * cleared */
		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
							     0, (int) (long) event->arg);
			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
							 * BUFCLEAR event if we
							 * couldn't submit */
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
					 * blockage has been cleared */
		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
					 * reconstruction blockage has been
					 * cleared */
		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

		/* a buffer has become ready to write */
	case RF_REVENT_BUFREADY:
		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextWriteRequest(raidPtr);
#if RF_DEBUG_RECON
			if (rf_floatingRbufDebug) {
				rf_CheckFloatingRbufCount(raidPtr, 1);
			}
#endif
		}
		break;

		/* we need to skip the current RU entirely because it got
		 * recon'd while we were waiting for something else to happen */
	case RF_REVENT_SKIP:
		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

		/* a forced-reconstruction read access has completed.  Just
		 * submit the buffer */
	case RF_REVENT_FORCEDREADDONE:
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
			RF_ASSERT(!submitblocked);
			retcode = 0;
		}
		break;

		/* A read I/O failed to complete */
	case RF_REVENT_READ_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

		/* A write I/O failed to complete */
	case RF_REVENT_WRITE_FAILED:
		retcode = RF_RECON_WRITE_ERROR;
		/* This is an error, but it was a pending write.
		   Account for it. */
		rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

		rbuf = (RF_ReconBuffer_t *) event->arg;

		/* cleanup the disk queue data */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);

		/* At this point we're erroring out, badly, and floatingRbufs
		   may not even be valid.  Rather than putting this back onto
		   the floatingRbufs list, just arrange for its immediate
		   destruction.
		*/
		rf_FreeReconBuffer(rbuf);
		break;

		/* a forced read I/O failed to complete */
	case RF_REVENT_FORCEDREAD_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

	default:
		RF_PANIC();
	}
	rf_FreeReconEventDesc(event);
	return (retcode);
}
/*****************************************************************************
 *
 * find the next thing that's needed on the indicated disk, and issue
 * a read request for it.  We assume that the reconstruction buffer
 * associated with this process is free to receive the data.  If
 * reconstruction is blocked on the indicated RU, we issue a
 * blockage-release request instead of a physical disk read request.
 * If the current disk gets too far ahead of the others, we issue a
 * head-separation wait request and return.
 *
 * ctrl->{ru_count, curPSID, diskOffset} and
 * rbuf->failedDiskSectorOffset are maintained to point to the unit
 * we're currently accessing.  Note that this deviates from the
 * standard C idiom of having counters point to the next thing to be
 * accessed.  This allows us to easily retry when we're blocked by
 * head separation or reconstruction-blockage events.
 *
 *****************************************************************************/
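/*
 * Illustration (added for clarity): RUsPerPU below is
 * SUsPerPU / SUsPerRU.  If, say, a parity unit spans 4 stripe units and
 * a reconstruction unit spans 1, ru_count cycles through 0..3 within
 * the current parity stripe (advancing diskOffset by sectorsPerRU each
 * step) before curPSID is advanced to the next parity stripe.
 */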
static int
IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	int     do_new_check = 0, retcode = 0, status;

	/* if we are currently the slowest disk, mark that we have to do a new
	 * check */
	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
		do_new_check = 1;

	while (1) {

		ctrl->ru_count++;
		if (ctrl->ru_count < RUsPerPU) {
			ctrl->diskOffset += sectorsPerRU;
			rbuf->failedDiskSectorOffset += sectorsPerRU;
		} else {
			ctrl->curPSID++;
			ctrl->ru_count = 0;
			/* code left over from when head-sep was based on
			 * parity stripe id */
			if (ctrl->curPSID > raidPtr->reconControl->lastPSID) {
				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
				return (RF_RECON_DONE_READS);	/* finito! */
			}
			/* find the disk offsets of the start of the parity
			 * stripe on both the current disk and the failed
			 * disk.  skip this entire parity stripe if either disk
			 * does not appear in the indicated PS */
			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
						      &rbuf->spCol, &rbuf->spOffset);
			if (status) {
				ctrl->ru_count = RUsPerPU - 1;
				continue;
			}
		}
		rbuf->which_ru = ctrl->ru_count;

		/* skip this RU if it's already been reconstructed */
		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
			continue;
		}
		break;
	}
	ctrl->headSepCounter++;
	if (do_new_check)
		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */


	/* at this point, we have definitely decided what to do, and we have
	 * only to see if we can actually do it now */
	rbuf->parityStripeID = ctrl->curPSID;
	rbuf->which_ru = ctrl->ru_count;
#if RF_ACC_TRACE > 0
	memset((char *) &raidPtr->recon_tracerecs[col], 0,
	       sizeof(raidPtr->recon_tracerecs[col]));
	raidPtr->recon_tracerecs[col].reconacc = 1;
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	retcode = TryToRead(raidPtr, col);
	return (retcode);
}

/*
 * tries to issue the next read on the indicated disk.  We may be
 * blocked by (a) the heads being too far apart, or (b) recon on the
 * indicated RU being blocked due to a write by a user thread.  In
 * this case, we issue a head-sep or blockage wait request, which will
 * cause this same routine to be invoked again later when the blockage
 * has cleared.
 */

static int
TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
	RF_StripeNum_t psid = ctrl->curPSID;
	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
	RF_DiskQueueData_t *req;
	int     status;
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;

	/* if the current disk is too far ahead of the others, issue a
	 * head-separation wait and return */
	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
		return (0);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);
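	/*
	 * Note (added for clarity): the PSS entry is preallocated
	 * *before* taking the PSS mutex so the allocation never sleeps
	 * while the mutex is held; if rf_LookupRUStatus() finds an
	 * existing entry, the spare one is simply freed again below.
	 */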
	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);

	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is blocked on the indicated parity stripe, issue a
	 * block-wait request and return. this also must mark the indicated RU
	 * in the stripe as under reconstruction if not blocked. */
	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
	if (status == RF_PSS_RECON_BLOCKED) {
		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
		goto out;
	} else
		if (status == RF_PSS_FORCED_ON_WRITE) {
			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
			goto out;
		}
	/* make one last check to be sure that the indicated RU didn't get
	 * reconstructed while we were waiting for something else to happen.
	 * This is unfortunate in that it causes us to make this check twice
	 * in the normal case.  Might want to make some attempt to re-work
	 * this so that we only do this check if we've definitely blocked on
	 * one of the above checks.  When this condition is detected, we may
	 * have just created a bogus status entry, which we need to delete. */
	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
		if (pssPtr == newpssPtr)
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
		goto out;
	}
	/* found something to read.  issue the I/O */
	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
		 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
		RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	/* should be ok to use a NULL proc pointer here, all the bufs we use
	 * should be in kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
				     ReconReadDoneProc, (void *) ctrl,
#if RF_ACC_TRACE > 0
				     &raidPtr->recon_tracerecs[col],
#else
				     NULL,
#endif
				     (void *) raidPtr, 0, NULL, PR_WAITOK);

	ctrl->rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
	pssPtr->issued[col] = 1;

out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}


/*
 * given a parity stripe ID, we want to find out whether both the
 * current disk and the failed disk exist in that parity stripe.  If
 * not, we want to skip this whole PS.  If so, we want to find the
 * disk offset of the start of the PS on both the current disk and the
 * failed disk.
 *
 * this works by getting a list of disks comprising the indicated
 * parity stripe, and searching the list for the current and failed
 * disks.  Once we've decided they both exist in the parity stripe, we
 * need to decide whether each is data or parity, so that we'll know
 * which mapping function to call to get the corresponding disk
 * offsets.
 *
 * this is kind of unpleasant, but doing it this way allows the
 * reconstruction code to use parity stripe IDs rather than physical
 * disk addresses to march through the failed disk, which greatly
 * simplifies a lot of code, as well as eliminating the need for a
 * reverse-mapping function.  I also think it will execute faster,
 * since the calls to the mapping module are kept to a minimum.
 *
 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
 * THE STRIPE IN THE CORRECT ORDER
 *
 * raidPtr          - raid descriptor
 * psid             - parity stripe identifier
 * col              - column of disk to find the offsets for
 * spCol            - out: col of spare unit for failed unit
 * spOffset         - out: offset into disk containing spare unit
 *
 */
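/*
 * Worked example (added for clarity), assuming a stripe of width 4
 * whose IdentifyStripe ordering is diskids[] = {2, 0, 3, 1} with the
 * parity unit on disk 0 (so k = 1 below): for col = 1 we get i = 3,
 * and since the parity slot precedes it, i_offset becomes 2, i.e. col
 * holds the third data unit of the stripe and its sector offset is
 * computed from sosRaidAddress + 2 * sectorsPerStripeUnit.
 */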
static int
ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
		     RF_SectorNum_t *outFailedDiskSectorOffset,
		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
	RF_RowCol_t *diskids;
	u_int   i, j, k, i_offset, j_offset;
	RF_RowCol_t pcol;
	int     testcol;
	RF_SectorNum_t poffset;
	char    i_is_parity = 0, j_is_parity = 0;
	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;

	/* get a listing of the disks comprising that stripe */
	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
	RF_ASSERT(diskids);

	/* reject this entire parity stripe if it does not contain the
	 * indicated disk or it does not contain the failed disk */

	for (i = 0; i < stripeWidth; i++) {
		if (col == diskids[i])
			break;
	}
	if (i == stripeWidth)
		goto skipit;
	for (j = 0; j < stripeWidth; j++) {
		if (fcol == diskids[j])
			break;
	}
	if (j == stripeWidth) {
		goto skipit;
	}
	/* find out which disk the parity is on */
	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);

	/* find out if either the current RU or the failed RU is parity */
	/* also, if the parity occurs in this stripe prior to the data and/or
	 * failed col, we need to decrement i and/or j */
	for (k = 0; k < stripeWidth; k++)
		if (diskids[k] == pcol)
			break;
	RF_ASSERT(k < stripeWidth);
	i_offset = i;
	j_offset = j;
	if (k < i)
		i_offset--;
	else
		if (k == i) {
			i_is_parity = 1;
			i_offset = 0;
		}		/* set offsets to zero to disable multiply
				 * below */
	if (k < j)
		j_offset--;
	else
		if (k == j) {
			j_is_parity = 1;
			j_offset = 0;
		}
	/* at this point, [ij]_is_parity tells us whether the [current,failed]
	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
	 * tells us how far into the stripe the [current,failed] disk is. */

	/* call the mapping routine to get the offset into the current disk,
	 * repeat for failed disk. */
	if (i_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);

	RF_ASSERT(col == testcol);

	if (j_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
	RF_ASSERT(fcol == testcol);

	/* now locate the spare unit for the failed unit */
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		if (j_is_parity)
			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
		else
			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
	} else {
#endif
		*spCol = raidPtr->reconControl->spareCol;
		*spOffset = *outFailedDiskSectorOffset;
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	return (0);

skipit:
	Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
		 psid, col);
	return (1);
}
/* this is called when a buffer has become ready to write to the replacement disk */
static int
IssueNextWriteRequest(RF_Raid_t *raidPtr)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
#if RF_ACC_TRACE > 0
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
#endif
	RF_ReconBuffer_t *rbuf;
	RF_DiskQueueData_t *req;

	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
				 * have gotten the event that sent us here */
	RF_ASSERT(rbuf->pssPtr);

	rbuf->pssPtr->writeRbuf = rbuf;
	rbuf->pssPtr = NULL;

	Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
		 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
		 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
	Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
		 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
		 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);

	/* should be ok to use a NULL b_proc here b/c all addrs should be in
	 * kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
				     sectorsPerRU, rbuf->buffer,
				     rbuf->parityStripeID, rbuf->which_ru,
				     ReconWriteDoneProc, (void *) rbuf,
#if RF_ACC_TRACE > 0
				     &raidPtr->recon_tracerecs[fcol],
#else
				     NULL,
#endif
				     (void *) raidPtr, 0, NULL, PR_WAITOK);

	rbuf->arg = (void *) req;
	rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
	raidPtr->reconControl->pending_writes++;
	rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);

	return (0);
}
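/*
 * Note (added for clarity): pending_writes is incremented under
 * rb_mutex just before each write is enqueued above, and decremented
 * in ProcessReconEvent() when the matching RF_REVENT_WRITEDONE or
 * RF_REVENT_WRITE_FAILED event arrives; both the main loop and the
 * error path rely on this counter to know when all writes have drained.
 */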
/*
 * this gets called upon the completion of a reconstruction read
 * operation.  the arg is a pointer to the per-disk reconstruction
 * control structure for the process that just finished a read.
 *
 * called at interrupt context in the kernel, so don't do anything
 * illegal here.
 */
static int
ReconReadDoneProc(void *arg, int status)
{
	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
	RF_Raid_t *raidPtr;

	/* Detect that reconCtrl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
	   There won't be anyone listening for this event anyway */

	if (ctrl->reconCtrl == NULL)
		return(0);

	raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;

	if (status) {
		printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
		rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
		return(0);
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
		RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
#endif
	rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
	return (0);
}
/* this gets called upon the completion of a reconstruction write operation.
 * the arg is a pointer to the rbuf that was just written
 *
 * called at interrupt context in the kernel, so don't do anything illegal here.
 */
static int
ReconWriteDoneProc(void *arg, int status)
{
	RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;

	/* Detect that reconControl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
	   There won't be anyone listening for this event anyway */

	if (rbuf->raidPtr->reconControl == NULL)
		return(0);

	Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
	if (status) {
		printf("raid%d: Recon write failed (status %d(0x%x))!\n", rbuf->raidPtr->raidid, status, status);
		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
		return(0);
	}
	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
	return (0);
}


/*
 * computes a new minimum head sep, and wakes up anyone who needs to
 * be woken as a result
 */
static void
CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_HeadSepLimit_t new_min;
	RF_RowCol_t i;
	RF_CallbackDesc_t *p;
	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
								 * of a minimum */


	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);

	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
	for (i = 0; i < raidPtr->numCol; i++)
		if (i != reconCtrlPtr->fcol) {
			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
		}
	/* set the new minimum and wake up anyone who can now run again */
	if (new_min != reconCtrlPtr->minHeadSepCounter) {
		reconCtrlPtr->minHeadSepCounter = new_min;
		Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
		while (reconCtrlPtr->headSepCBList) {
			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
				break;
			p = reconCtrlPtr->headSepCBList;
			reconCtrlPtr->headSepCBList = p->next;
			p->next = NULL;
			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
			rf_FreeCallbackDesc(p);
		}

	}
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	rf_broadcast_cond2(reconCtrlPtr->rb_cv);
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);
}
/*
 * checks to see that the maximum head separation will not be violated
 * if we initiate a reconstruction I/O on the indicated disk.
 * Limiting the maximum head separation between two disks eliminates
 * the nasty buffer-stall conditions that occur when one disk races
 * ahead of the others and consumes all of the floating recon buffers.
 * This code is complex and unpleasant but it's necessary to avoid
 * some very nasty, albeit fairly rare, reconstruction behavior.
 *
 * returns non-zero if and only if we have to stop working on the
 * indicated disk due to a head-separation delay.
 */
static int
CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
		    RF_ReconUnitNum_t which_ru)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_CallbackDesc_t *cb, *p, *pt;
	int retval = 0;

	/* if we're too far ahead of the slowest disk, stop working on this
	 * disk until the slower ones catch up.  We do this by scheduling a
	 * wakeup callback for the time when the slowest disk has caught up.
	 * We define "caught up" with 20% hysteresis, i.e. the head separation
	 * must have fallen to at most 80% of the max allowable head
	 * separation before we'll wake up.
	 */
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	while (reconCtrlPtr->rb_lock) {
		rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);

	if ((raidPtr->headSepLimit >= 0) &&
	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
			 raidPtr->raidid, col, ctrl->headSepCounter,
			 reconCtrlPtr->minHeadSepCounter,
			 raidPtr->headSepLimit);
		cb = rf_AllocCallbackDesc();
		/* the minHeadSepCounter value we have to get to before we'll
		 * wake up.  build in 20% hysteresis. */
		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
		cb->col = col;
		cb->next = NULL;

		/* insert this callback descriptor into the sorted list of
		 * pending head-sep callbacks */
		p = reconCtrlPtr->headSepCBList;
		if (!p)
			reconCtrlPtr->headSepCBList = cb;
		else
			if (cb->callbackArg.v < p->callbackArg.v) {
				cb->next = reconCtrlPtr->headSepCBList;
				reconCtrlPtr->headSepCBList = cb;
			} else {
				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next)
					;
				cb->next = p;
				pt->next = cb;
			}
		retval = 1;
#if RF_RECON_STATS > 0
		ctrl->reconCtrl->reconDesc->hsStallCount++;
#endif				/* RF_RECON_STATS > 0 */
	}
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	rf_broadcast_cond2(reconCtrlPtr->rb_cv);
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);

	return (retval);
}
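/*
 * Concretely (numbers invented for illustration): with headSepLimit =
 * 100, a disk at headSepCounter = 1000 stalls once minHeadSepCounter
 * falls to 899, since 1000 - 899 = 101 > 100.  Its wakeup threshold is
 * then 1000 - 100 + 100/5 = 920, so it resumes only when the slowest
 * disk reaches 920, i.e. when the separation has shrunk to 80 -- 80%
 * of the limit, which is the 20% hysteresis described above.
 */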
/*
 * checks to see if reconstruction has been either forced or blocked
 * by a user operation.  if forced, we skip this RU entirely.  else if
 * blocked, put ourselves on the wait list.  else return 0.
 *
 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
 */
static int
CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
				   RF_ReconParityStripeStatus_t *pssPtr,
				   RF_PerDiskReconCtrl_t *ctrl,
				   RF_RowCol_t col,
				   RF_StripeNum_t psid,
				   RF_ReconUnitNum_t which_ru)
{
	RF_CallbackDesc_t *cb;
	int retcode = 0;

	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
		retcode = RF_PSS_FORCED_ON_WRITE;
	else
		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
			Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
			cb = rf_AllocCallbackDesc();	/* append ourselves to
							 * the blockage-wait
							 * list */
			cb->col = col;
			cb->next = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb;
			retcode = RF_PSS_RECON_BLOCKED;
		}
	if (!retcode)
		pssPtr->flags |= RF_PSS_UNDER_RECON;	/* mark this RU as under
							 * reconstruction */

	return (retcode);
}
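/*
 * The three possible outcomes above, summarized (return value =>
 * caller action, paraphrasing the logic):
 *
 *	RF_PSS_FORCED_ON_WRITE  => a forced recon already covers this
 *				   RU; skip it entirely
 *	RF_PSS_RECON_BLOCKED    => a user operation has blocked this
 *				   RU; we are now queued on
 *				   blockWaitList and will be released
 *				   by rf_UnblockRecon() below
 *	0                       => proceed; the RU is now marked
 *				   RF_PSS_UNDER_RECON
 */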
/*
 * if reconstruction is currently ongoing for the indicated stripeID,
 * reconstruction is forced to completion and we return non-zero to
 * indicate that the caller must wait.  If not, then reconstruction is
 * blocked on the indicated stripe and the routine returns zero.  If
 * and only if we return non-zero, we'll cause the cbFunc to get
 * invoked with the cbArg when the reconstruction has completed.
 */
int
rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
{
	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
							 * forcing recon on */
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
								 * stripe status structure */
	RF_StripeNum_t psid;	/* parity stripe id */
	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
						 * offset */
	RF_RowCol_t *diskids;
	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
	RF_RowCol_t fcol, diskno, i;
	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
	RF_DiskQueueData_t *req;	/* disk I/O req to be enqueued */
	RF_CallbackDesc_t *cb;
	int nPromoted;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);

	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);

	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is not ongoing on this PS, just return */
	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
		return (0);
	}
	/* otherwise, we have to wait for reconstruction to complete on this
	 * RU. */
	/* In order to avoid waiting for a potentially large number of
	 * low-priority accesses to complete, we force a normal-priority (i.e.
	 * not low-priority) reconstruction on this RU. */
	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
		DDprintf1("Forcing recon on psid %ld\n", psid);
		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
								 * forced recon */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
							 * that we just set */
		fcol = raidPtr->reconControl->fcol;

		/* get a listing of the disks comprising the indicated stripe */
		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);

		/* For previously issued reads, elevate them to normal
		 * priority.  If the I/O has already completed, it won't be
		 * found in the queue, and hence this will be a no-op.  For
		 * unissued reads, allocate buffers and issue new reads.  The
		 * fact that we've set the FORCED bit means that the regular
		 * recon procs will not re-issue these reqs */
		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
			if ((diskno = diskids[i]) != fcol) {
				if (pssPtr->issued[diskno]) {
					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
					if (rf_reconDebug && nPromoted)
						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
				} else {
					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
											 * location */
					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
					new_rbuf->which_ru = which_ru;
					new_rbuf->failedDiskSectorOffset = fd_offset;
					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;

					/* use NULL b_proc b/c all addrs
					 * should be in kernel space */
					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);

					new_rbuf->arg = req;
					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
				}
			}
		/* if the write is sitting in the disk queue, elevate its
		 * priority */
		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
			if (rf_reconDebug)
				printf("raid%d: promoted write to col %d\n",
				       raidPtr->raidid, fcol);
	}
	/* install a callback descriptor to be invoked when recon completes on
	 * this parity stripe. */
	cb = rf_AllocCallbackDesc();
	/* XXX the following is bogus.. These functions don't really match!!
	 * GO */
	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
	cb->callbackArg.p = (void *) cbArg;
	cb->next = pssPtr->procWaitList;
	pssPtr->procWaitList = cb;
	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
		  raidPtr->raidid, psid);

	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (1);
}
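/*
 * A sketch of the caller's side of the contract above.  The names
 * myCallback and myArg are placeholders, not part of the driver; the
 * real callers live elsewhere in RAIDframe:
 *
 *	if (rf_ForceOrBlockRecon(raidPtr, asmap, myCallback, myArg)) {
 *		// non-zero: recon was forced on this stripe; sleep
 *		// until myCallback fires, then retry the access
 *	} else {
 *		// zero: recon is now blocked on this stripe; perform
 *		// the access, then call rf_UnblockRecon(raidPtr, asmap)
 *	}
 */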
/* called upon the completion of a forced reconstruction read.
 * all we do is schedule the FORCEDREADDONE event.
 * called at interrupt context in the kernel, so don't do anything illegal here.
 */
static void
ForceReconReadDoneProc(void *arg, int status)
{
	RF_ReconBuffer_t *rbuf = arg;

	/* Detect that reconControl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
	   There won't be anyone listening for this event anyway */

	if (rbuf->raidPtr->reconControl == NULL)
		return;

	if (status) {
		printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
		return;
	}
	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
}
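/*
 * Note the asymmetry with ReconReadDoneProc(): the normal read path
 * posts its event with a NULL argument, while the forced path passes
 * the rbuf itself -- presumably because forced buffers are allocated
 * on demand in rf_ForceOrBlockRecon() and the event handler needs the
 * pointer back to locate them, whereas the regular per-disk buffers
 * are reachable from the column number alone.
 */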
/* releases a block on the reconstruction of the indicated stripe */
int
rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
{
	RF_StripeNum_t stripeID = asmap->stripeID;
	RF_ReconParityStripeStatus_t *pssPtr;
	RF_ReconUnitNum_t which_ru;
	RF_StripeNum_t psid;
	RF_CallbackDesc_t *cb;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);

	/* When recon is forced, the pss desc can get deleted before we get
	 * back to unblock recon.  But, this can _only_ happen when recon is
	 * forced.  It would be good to put some kind of sanity check here,
	 * but how to decide if recon was just forced or not? */
	if (!pssPtr) {
#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
		if (rf_reconDebug || rf_pssDebug)
			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
#endif
		goto out;
	}
	pssPtr->blockCount--;
	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
		 raidPtr->raidid, psid, pssPtr->blockCount);
	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */

		/* unblock recon before calling CauseReconEvent in case
		 * CauseReconEvent causes us to try to issue a new read before
		 * returning here. */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;

		while (pssPtr->blockWaitList) {
			/* spin through the block-wait list and
			   release all the waiters */
			cb = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb->next;
			cb->next = NULL;
			rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
			rf_FreeCallbackDesc(cb);
		}
		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
			/* if no recon was requested while recon was blocked */
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		}
	}
out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}

void
rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
{
	RF_CallbackDesc_t *p;

	rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
	while (raidPtr->reconControl->rb_lock) {
		rf_wait_cond2(raidPtr->reconControl->rb_cv,
			      raidPtr->reconControl->rb_mutex);
	}
	raidPtr->reconControl->rb_lock = 1;
	rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

	while (raidPtr->reconControl->headSepCBList) {
		p = raidPtr->reconControl->headSepCBList;
		raidPtr->reconControl->headSepCBList = p->next;
		p->next = NULL;
		rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
		rf_FreeCallbackDesc(p);
	}
	rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
	raidPtr->reconControl->rb_lock = 0;
	rf_broadcast_cond2(raidPtr->reconControl->rb_cv);
	rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
}
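/*
 * Unlike CheckForNewMinHeadSep(), which releases only the waiters
 * whose thresholds have been reached, rf_WakeupHeadSepCBWaiters()
 * drains headSepCBList unconditionally.  Presumably this is the
 * teardown/abort path: when reconstruction is ending, every stalled
 * per-disk process must be released regardless of head separation, or
 * it would wait forever on HEADSEPCLEAR events that will never arrive.
 */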