1 /* $NetBSD: rf_reconstruct.c,v 1.101 2008/01/26 20:45:06 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /************************************************************ 30 * 31 * rf_reconstruct.c -- code to perform on-line reconstruction 32 * 33 ************************************************************/ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.101 2008/01/26 20:45:06 oster Exp $"); 37 38 #include <sys/param.h> 39 #include <sys/time.h> 40 #include <sys/buf.h> 41 #include <sys/errno.h> 42 #include <sys/systm.h> 43 #include <sys/proc.h> 44 #include <sys/ioctl.h> 45 #include <sys/fcntl.h> 46 #include <sys/vnode.h> 47 #include <dev/raidframe/raidframevar.h> 48 49 #include "rf_raid.h" 50 #include "rf_reconutil.h" 51 #include "rf_revent.h" 52 #include "rf_reconbuffer.h" 53 #include "rf_acctrace.h" 54 #include "rf_etimer.h" 55 #include "rf_dag.h" 56 #include "rf_desc.h" 57 #include "rf_debugprint.h" 58 #include "rf_general.h" 59 #include "rf_driver.h" 60 #include "rf_utils.h" 61 #include "rf_shutdown.h" 62 63 #include "rf_kintf.h" 64 65 /* setting these to -1 causes them to be set to their default values if not set by debug options */ 66 67 #if RF_DEBUG_RECON 68 #define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) 69 #define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 70 #define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 71 #define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) 72 #define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL) 73 #define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) 74 #define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL) 75 #define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) 76 77 #define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 78 #define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 79 80 #else /* RF_DEBUG_RECON */ 81 82 #define Dprintf(s) {} 83 #define Dprintf1(s,a) {} 84 #define Dprintf2(s,a,b) {} 85 #define Dprintf3(s,a,b,c) {} 86 #define Dprintf4(s,a,b,c,d) {} 87 #define Dprintf5(s,a,b,c,d,e) {} 88 #define Dprintf6(s,a,b,c,d,e,f) {} 89 #define Dprintf7(s,a,b,c,d,e,f,g) {} 90 91 #define DDprintf1(s,a) {} 92 #define DDprintf2(s,a,b) {} 93 94 #endif /* RF_DEBUG_RECON */ 95 96 #define RF_RECON_DONE_READS 1 97 #define RF_RECON_READ_ERROR 2 98 #define RF_RECON_WRITE_ERROR 3 99 #define RF_RECON_READ_STOPPED 4 100 101 #define RF_MAX_FREE_RECONBUFFER 32 102 #define RF_MIN_FREE_RECONBUFFER 16 103 104 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t, 105 RF_RaidDisk_t *, int, RF_RowCol_t); 106 static void FreeReconDesc(RF_RaidReconDesc_t *); 107 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *); 108 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t); 109 static int TryToRead(RF_Raid_t *, RF_RowCol_t); 110 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t, 111 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *, 112 RF_SectorNum_t *); 113 static int IssueNextWriteRequest(RF_Raid_t *); 114 static int ReconReadDoneProc(void *, int); 115 static int ReconWriteDoneProc(void *, int); 116 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t); 117 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *, 118 RF_RowCol_t, RF_HeadSepLimit_t, 119 RF_ReconUnitNum_t); 120 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *, 121 RF_ReconParityStripeStatus_t *, 122 RF_PerDiskReconCtrl_t *, 123 RF_RowCol_t, RF_StripeNum_t, 124 RF_ReconUnitNum_t); 125 static void ForceReconReadDoneProc(void *, int); 126 static void rf_ShutdownReconstruction(void *); 127 128 struct RF_ReconDoneProc_s { 129 void (*proc) (RF_Raid_t *, void *); 130 void *arg; 131 RF_ReconDoneProc_t *next; 132 }; 133 134 /************************************************************************** 135 * 136 * sets up the parameters that will be used by the reconstruction process 137 * currently there are none, except for those that the layout-specific 138 * configuration (e.g. rf_ConfigureDeclustered) routine sets up. 139 * 140 * in the kernel, we fire off the recon thread. 141 * 142 **************************************************************************/ 143 static void 144 rf_ShutdownReconstruction(void *ignored) 145 { 146 pool_destroy(&rf_pools.reconbuffer); 147 } 148 149 int 150 rf_ConfigureReconstruction(RF_ShutdownList_t **listp) 151 { 152 153 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t), 154 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER); 155 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL); 156 157 return (0); 158 } 159 160 static RF_RaidReconDesc_t * 161 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col, 162 RF_RaidDisk_t *spareDiskPtr, int numDisksDone, 163 RF_RowCol_t scol) 164 { 165 166 RF_RaidReconDesc_t *reconDesc; 167 168 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t), 169 (RF_RaidReconDesc_t *)); 170 reconDesc->raidPtr = raidPtr; 171 reconDesc->col = col; 172 reconDesc->spareDiskPtr = spareDiskPtr; 173 reconDesc->numDisksDone = numDisksDone; 174 reconDesc->scol = scol; 175 reconDesc->next = NULL; 176 177 return (reconDesc); 178 } 179 180 static void 181 FreeReconDesc(RF_RaidReconDesc_t *reconDesc) 182 { 183 #if RF_RECON_STATS > 0 184 printf("raid%d: %lu recon event waits, %lu recon delays\n", 185 reconDesc->raidPtr->raidid, 186 (long) reconDesc->numReconEventWaits, 187 (long) reconDesc->numReconExecDelays); 188 #endif /* RF_RECON_STATS > 0 */ 189 printf("raid%d: %lu max exec ticks\n", 190 reconDesc->raidPtr->raidid, 191 (long) reconDesc->maxReconExecTicks); 192 #if (RF_RECON_STATS > 0) || defined(KERNEL) 193 printf("\n"); 194 #endif /* (RF_RECON_STATS > 0) || KERNEL */ 195 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t)); 196 } 197 198 199 /***************************************************************************** 200 * 201 * primary routine to reconstruct a failed disk. This should be called from 202 * within its own thread. It won't return until reconstruction completes, 203 * fails, or is aborted. 204 *****************************************************************************/ 205 int 206 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col) 207 { 208 const RF_LayoutSW_t *lp; 209 int rc; 210 211 lp = raidPtr->Layout.map; 212 if (lp->SubmitReconBuffer) { 213 /* 214 * The current infrastructure only supports reconstructing one 215 * disk at a time for each array. 216 */ 217 RF_LOCK_MUTEX(raidPtr->mutex); 218 while (raidPtr->reconInProgress) { 219 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); 220 } 221 raidPtr->reconInProgress++; 222 RF_UNLOCK_MUTEX(raidPtr->mutex); 223 rc = rf_ReconstructFailedDiskBasic(raidPtr, col); 224 RF_LOCK_MUTEX(raidPtr->mutex); 225 raidPtr->reconInProgress--; 226 RF_UNLOCK_MUTEX(raidPtr->mutex); 227 } else { 228 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", 229 lp->parityConfig); 230 rc = EIO; 231 } 232 RF_SIGNAL_COND(raidPtr->waitForReconCond); 233 return (rc); 234 } 235 236 int 237 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col) 238 { 239 RF_ComponentLabel_t c_label; 240 RF_RaidDisk_t *spareDiskPtr = NULL; 241 RF_RaidReconDesc_t *reconDesc; 242 RF_RowCol_t scol; 243 int numDisksDone = 0, rc; 244 245 /* first look for a spare drive onto which to reconstruct the data */ 246 /* spare disk descriptors are stored in row 0. This may have to 247 * change eventually */ 248 249 RF_LOCK_MUTEX(raidPtr->mutex); 250 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); 251 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 252 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 253 if (raidPtr->status != rf_rs_degraded) { 254 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col); 255 RF_UNLOCK_MUTEX(raidPtr->mutex); 256 return (EINVAL); 257 } 258 scol = (-1); 259 } else { 260 #endif 261 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) { 262 if (raidPtr->Disks[scol].status == rf_ds_spare) { 263 spareDiskPtr = &raidPtr->Disks[scol]; 264 spareDiskPtr->status = rf_ds_used_spare; 265 break; 266 } 267 } 268 if (!spareDiskPtr) { 269 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col); 270 RF_UNLOCK_MUTEX(raidPtr->mutex); 271 return (ENOSPC); 272 } 273 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol); 274 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 275 } 276 #endif 277 RF_UNLOCK_MUTEX(raidPtr->mutex); 278 279 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol); 280 raidPtr->reconDesc = (void *) reconDesc; 281 #if RF_RECON_STATS > 0 282 reconDesc->hsStallCount = 0; 283 reconDesc->numReconExecDelays = 0; 284 reconDesc->numReconEventWaits = 0; 285 #endif /* RF_RECON_STATS > 0 */ 286 reconDesc->reconExecTimerRunning = 0; 287 reconDesc->reconExecTicks = 0; 288 reconDesc->maxReconExecTicks = 0; 289 rc = rf_ContinueReconstructFailedDisk(reconDesc); 290 291 if (!rc) { 292 /* fix up the component label */ 293 /* Don't actually need the read here.. */ 294 raidread_component_label( 295 raidPtr->raid_cinfo[scol].ci_dev, 296 raidPtr->raid_cinfo[scol].ci_vp, 297 &c_label); 298 299 raid_init_component_label( raidPtr, &c_label); 300 c_label.row = 0; 301 c_label.column = col; 302 c_label.clean = RF_RAID_DIRTY; 303 c_label.status = rf_ds_optimal; 304 c_label.partitionSize = raidPtr->Disks[scol].partitionSize; 305 306 /* We've just done a rebuild based on all the other 307 disks, so at this point the parity is known to be 308 clean, even if it wasn't before. */ 309 310 /* XXX doesn't hold for RAID 6!!*/ 311 312 RF_LOCK_MUTEX(raidPtr->mutex); 313 raidPtr->parity_good = RF_RAID_CLEAN; 314 RF_UNLOCK_MUTEX(raidPtr->mutex); 315 316 /* XXXX MORE NEEDED HERE */ 317 318 raidwrite_component_label( 319 raidPtr->raid_cinfo[scol].ci_dev, 320 raidPtr->raid_cinfo[scol].ci_vp, 321 &c_label); 322 323 } else { 324 /* Reconstruct failed. */ 325 326 RF_LOCK_MUTEX(raidPtr->mutex); 327 /* Failed disk goes back to "failed" status */ 328 raidPtr->Disks[col].status = rf_ds_failed; 329 330 /* Spare disk goes back to "spare" status. */ 331 spareDiskPtr->status = rf_ds_spare; 332 RF_UNLOCK_MUTEX(raidPtr->mutex); 333 334 } 335 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 336 return (rc); 337 } 338 339 /* 340 341 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL, 342 and you don't get a spare until the next Monday. With this function 343 (and hot-swappable drives) you can now put your new disk containing 344 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to 345 rebuild the data "on the spot". 346 347 */ 348 349 int 350 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col) 351 { 352 RF_RaidDisk_t *spareDiskPtr = NULL; 353 RF_RaidReconDesc_t *reconDesc; 354 const RF_LayoutSW_t *lp; 355 RF_ComponentLabel_t c_label; 356 int numDisksDone = 0, rc; 357 struct partinfo dpart; 358 struct vnode *vp; 359 struct vattr va; 360 int retcode; 361 int ac; 362 363 lp = raidPtr->Layout.map; 364 if (!lp->SubmitReconBuffer) { 365 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", 366 lp->parityConfig); 367 /* wakeup anyone who might be waiting to do a reconstruct */ 368 RF_SIGNAL_COND(raidPtr->waitForReconCond); 369 return(EIO); 370 } 371 372 /* 373 * The current infrastructure only supports reconstructing one 374 * disk at a time for each array. 375 */ 376 RF_LOCK_MUTEX(raidPtr->mutex); 377 378 if (raidPtr->Disks[col].status != rf_ds_failed) { 379 /* "It's gone..." */ 380 raidPtr->numFailures++; 381 raidPtr->Disks[col].status = rf_ds_failed; 382 raidPtr->status = rf_rs_degraded; 383 RF_UNLOCK_MUTEX(raidPtr->mutex); 384 rf_update_component_labels(raidPtr, 385 RF_NORMAL_COMPONENT_UPDATE); 386 RF_LOCK_MUTEX(raidPtr->mutex); 387 } 388 389 while (raidPtr->reconInProgress) { 390 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); 391 } 392 393 raidPtr->reconInProgress++; 394 395 /* first look for a spare drive onto which to reconstruct the 396 data. spare disk descriptors are stored in row 0. This 397 may have to change eventually */ 398 399 /* Actually, we don't care if it's failed or not... On a RAID 400 set with correct parity, this function should be callable 401 on any component without ill effects. */ 402 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */ 403 404 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 405 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 406 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col); 407 408 raidPtr->reconInProgress--; 409 RF_UNLOCK_MUTEX(raidPtr->mutex); 410 RF_SIGNAL_COND(raidPtr->waitForReconCond); 411 return (EINVAL); 412 } 413 #endif 414 415 /* This device may have been opened successfully the 416 first time. Close it before trying to open it again.. */ 417 418 if (raidPtr->raid_cinfo[col].ci_vp != NULL) { 419 #if 0 420 printf("Closed the open device: %s\n", 421 raidPtr->Disks[col].devname); 422 #endif 423 vp = raidPtr->raid_cinfo[col].ci_vp; 424 ac = raidPtr->Disks[col].auto_configured; 425 RF_UNLOCK_MUTEX(raidPtr->mutex); 426 rf_close_component(raidPtr, vp, ac); 427 RF_LOCK_MUTEX(raidPtr->mutex); 428 raidPtr->raid_cinfo[col].ci_vp = NULL; 429 } 430 /* note that this disk was *not* auto_configured (any longer)*/ 431 raidPtr->Disks[col].auto_configured = 0; 432 433 #if 0 434 printf("About to (re-)open the device for rebuilding: %s\n", 435 raidPtr->Disks[col].devname); 436 #endif 437 RF_UNLOCK_MUTEX(raidPtr->mutex); 438 retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE); 439 440 if (retcode) { 441 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid, 442 raidPtr->Disks[col].devname, retcode); 443 444 /* the component isn't responding properly... 445 must be still dead :-( */ 446 RF_LOCK_MUTEX(raidPtr->mutex); 447 raidPtr->reconInProgress--; 448 RF_UNLOCK_MUTEX(raidPtr->mutex); 449 RF_SIGNAL_COND(raidPtr->waitForReconCond); 450 return(retcode); 451 } 452 453 /* Ok, so we can at least do a lookup... 454 How about actually getting a vp for it? */ 455 456 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) { 457 RF_LOCK_MUTEX(raidPtr->mutex); 458 raidPtr->reconInProgress--; 459 RF_UNLOCK_MUTEX(raidPtr->mutex); 460 RF_SIGNAL_COND(raidPtr->waitForReconCond); 461 return(retcode); 462 } 463 464 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred); 465 if (retcode) { 466 RF_LOCK_MUTEX(raidPtr->mutex); 467 raidPtr->reconInProgress--; 468 RF_UNLOCK_MUTEX(raidPtr->mutex); 469 RF_SIGNAL_COND(raidPtr->waitForReconCond); 470 return(retcode); 471 } 472 RF_LOCK_MUTEX(raidPtr->mutex); 473 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize; 474 475 raidPtr->Disks[col].numBlocks = dpart.part->p_size - 476 rf_protectedSectors; 477 478 raidPtr->raid_cinfo[col].ci_vp = vp; 479 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev; 480 481 raidPtr->Disks[col].dev = va.va_rdev; 482 483 /* we allow the user to specify that only a fraction 484 of the disks should be used this is just for debug: 485 it speeds up * the parity scan */ 486 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks * 487 rf_sizePercentage / 100; 488 RF_UNLOCK_MUTEX(raidPtr->mutex); 489 490 spareDiskPtr = &raidPtr->Disks[col]; 491 spareDiskPtr->status = rf_ds_used_spare; 492 493 printf("raid%d: initiating in-place reconstruction on column %d\n", 494 raidPtr->raidid, col); 495 496 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, 497 numDisksDone, col); 498 raidPtr->reconDesc = (void *) reconDesc; 499 #if RF_RECON_STATS > 0 500 reconDesc->hsStallCount = 0; 501 reconDesc->numReconExecDelays = 0; 502 reconDesc->numReconEventWaits = 0; 503 #endif /* RF_RECON_STATS > 0 */ 504 reconDesc->reconExecTimerRunning = 0; 505 reconDesc->reconExecTicks = 0; 506 reconDesc->maxReconExecTicks = 0; 507 rc = rf_ContinueReconstructFailedDisk(reconDesc); 508 509 if (!rc) { 510 RF_LOCK_MUTEX(raidPtr->mutex); 511 /* Need to set these here, as at this point it'll be claiming 512 that the disk is in rf_ds_spared! But we know better :-) */ 513 514 raidPtr->Disks[col].status = rf_ds_optimal; 515 raidPtr->status = rf_rs_optimal; 516 RF_UNLOCK_MUTEX(raidPtr->mutex); 517 518 /* fix up the component label */ 519 /* Don't actually need the read here.. */ 520 raidread_component_label(raidPtr->raid_cinfo[col].ci_dev, 521 raidPtr->raid_cinfo[col].ci_vp, 522 &c_label); 523 524 RF_LOCK_MUTEX(raidPtr->mutex); 525 raid_init_component_label(raidPtr, &c_label); 526 527 c_label.row = 0; 528 c_label.column = col; 529 530 /* We've just done a rebuild based on all the other 531 disks, so at this point the parity is known to be 532 clean, even if it wasn't before. */ 533 534 /* XXX doesn't hold for RAID 6!!*/ 535 536 raidPtr->parity_good = RF_RAID_CLEAN; 537 RF_UNLOCK_MUTEX(raidPtr->mutex); 538 539 raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev, 540 raidPtr->raid_cinfo[col].ci_vp, 541 &c_label); 542 543 } else { 544 /* Reconstruct-in-place failed. Disk goes back to 545 "failed" status, regardless of what it was before. */ 546 RF_LOCK_MUTEX(raidPtr->mutex); 547 raidPtr->Disks[col].status = rf_ds_failed; 548 RF_UNLOCK_MUTEX(raidPtr->mutex); 549 } 550 551 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 552 553 RF_LOCK_MUTEX(raidPtr->mutex); 554 raidPtr->reconInProgress--; 555 RF_UNLOCK_MUTEX(raidPtr->mutex); 556 557 RF_SIGNAL_COND(raidPtr->waitForReconCond); 558 return (rc); 559 } 560 561 562 int 563 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc) 564 { 565 RF_Raid_t *raidPtr = reconDesc->raidPtr; 566 RF_RowCol_t col = reconDesc->col; 567 RF_RowCol_t scol = reconDesc->scol; 568 RF_ReconMap_t *mapPtr; 569 RF_ReconCtrl_t *tmp_reconctrl; 570 RF_ReconEvent_t *event; 571 RF_CallbackDesc_t *p; 572 struct timeval etime, elpsd; 573 unsigned long xor_s, xor_resid_us; 574 int i, ds; 575 int status; 576 int recon_error, write_error; 577 578 raidPtr->accumXorTimeUs = 0; 579 #if RF_ACC_TRACE > 0 580 /* create one trace record per physical disk */ 581 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); 582 #endif 583 584 /* quiesce the array prior to starting recon. this is needed 585 * to assure no nasty interactions with pending user writes. 586 * We need to do this before we change the disk or row status. */ 587 588 Dprintf("RECON: begin request suspend\n"); 589 rf_SuspendNewRequestsAndWait(raidPtr); 590 Dprintf("RECON: end request suspend\n"); 591 592 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */ 593 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol); 594 595 RF_LOCK_MUTEX(raidPtr->mutex); 596 597 /* create the reconstruction control pointer and install it in 598 * the right slot */ 599 raidPtr->reconControl = tmp_reconctrl; 600 mapPtr = raidPtr->reconControl->reconMap; 601 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs; 602 raidPtr->reconControl->numRUsComplete = 0; 603 raidPtr->status = rf_rs_reconstructing; 604 raidPtr->Disks[col].status = rf_ds_reconstructing; 605 raidPtr->Disks[col].spareCol = scol; 606 607 RF_UNLOCK_MUTEX(raidPtr->mutex); 608 609 RF_GETTIME(raidPtr->reconControl->starttime); 610 611 /* now start up the actual reconstruction: issue a read for 612 * each surviving disk */ 613 614 reconDesc->numDisksDone = 0; 615 for (i = 0; i < raidPtr->numCol; i++) { 616 if (i != col) { 617 /* find and issue the next I/O on the 618 * indicated disk */ 619 if (IssueNextReadRequest(raidPtr, i)) { 620 Dprintf1("RECON: done issuing for c%d\n", i); 621 reconDesc->numDisksDone++; 622 } 623 } 624 } 625 626 Dprintf("RECON: resume requests\n"); 627 rf_ResumeNewRequests(raidPtr); 628 629 /* process reconstruction events until all disks report that 630 * they've completed all work */ 631 632 mapPtr = raidPtr->reconControl->reconMap; 633 recon_error = 0; 634 write_error = 0; 635 636 while (reconDesc->numDisksDone < raidPtr->numCol - 1) { 637 638 event = rf_GetNextReconEvent(reconDesc); 639 status = ProcessReconEvent(raidPtr, event); 640 641 /* the normal case is that a read completes, and all is well. */ 642 if (status == RF_RECON_DONE_READS) { 643 reconDesc->numDisksDone++; 644 } else if ((status == RF_RECON_READ_ERROR) || 645 (status == RF_RECON_WRITE_ERROR)) { 646 /* an error was encountered while reconstructing... 647 Pretend we've finished this disk. 648 */ 649 recon_error = 1; 650 raidPtr->reconControl->error = 1; 651 652 /* bump the numDisksDone count for reads, 653 but not for writes */ 654 if (status == RF_RECON_READ_ERROR) 655 reconDesc->numDisksDone++; 656 657 /* write errors are special -- when we are 658 done dealing with the reads that are 659 finished, we don't want to wait for any 660 writes */ 661 if (status == RF_RECON_WRITE_ERROR) 662 write_error = 1; 663 664 } else if (status == RF_RECON_READ_STOPPED) { 665 /* count this component as being "done" */ 666 reconDesc->numDisksDone++; 667 } 668 669 if (recon_error) { 670 671 /* make sure any stragglers are woken up so that 672 their theads will complete, and we can get out 673 of here with all IO processed */ 674 675 while (raidPtr->reconControl->headSepCBList) { 676 p = raidPtr->reconControl->headSepCBList; 677 raidPtr->reconControl->headSepCBList = p->next; 678 p->next = NULL; 679 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); 680 rf_FreeCallbackDesc(p); 681 } 682 } 683 684 raidPtr->reconControl->numRUsTotal = 685 mapPtr->totalRUs; 686 raidPtr->reconControl->numRUsComplete = 687 mapPtr->totalRUs - 688 rf_UnitsLeftToReconstruct(mapPtr); 689 690 #if RF_DEBUG_RECON 691 raidPtr->reconControl->percentComplete = 692 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 693 if (rf_prReconSched) { 694 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime)); 695 } 696 #endif 697 } 698 699 mapPtr = raidPtr->reconControl->reconMap; 700 if (rf_reconDebug) { 701 printf("RECON: all reads completed\n"); 702 } 703 /* at this point all the reads have completed. We now wait 704 * for any pending writes to complete, and then we're done */ 705 706 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) { 707 708 event = rf_GetNextReconEvent(reconDesc); 709 status = ProcessReconEvent(raidPtr, event); 710 711 if (status == RF_RECON_WRITE_ERROR) { 712 recon_error = 1; 713 raidPtr->reconControl->error = 1; 714 /* an error was encountered at the very end... bail */ 715 } else { 716 #if RF_DEBUG_RECON 717 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); 718 if (rf_prReconSched) { 719 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime)); 720 } 721 #endif 722 } 723 } 724 725 if (recon_error) { 726 /* we've encountered an error in reconstructing. */ 727 printf("raid%d: reconstruction failed.\n", raidPtr->raidid); 728 729 /* we start by blocking IO to the RAID set. */ 730 rf_SuspendNewRequestsAndWait(raidPtr); 731 732 RF_LOCK_MUTEX(raidPtr->mutex); 733 /* mark set as being degraded, rather than 734 rf_rs_reconstructing as we were before the problem. 735 After this is done we can update status of the 736 component disks without worrying about someone 737 trying to read from a failed component. 738 */ 739 raidPtr->status = rf_rs_degraded; 740 RF_UNLOCK_MUTEX(raidPtr->mutex); 741 742 /* resume IO */ 743 rf_ResumeNewRequests(raidPtr); 744 745 /* At this point there are two cases: 746 1) If we've experienced a read error, then we've 747 already waited for all the reads we're going to get, 748 and we just need to wait for the writes. 749 750 2) If we've experienced a write error, we've also 751 already waited for all the reads to complete, 752 but there is little point in waiting for the writes -- 753 when they do complete, they will just be ignored. 754 755 So we just wait for writes to complete if we didn't have a 756 write error. 757 */ 758 759 if (!write_error) { 760 /* wait for writes to complete */ 761 while (raidPtr->reconControl->pending_writes > 0) { 762 763 event = rf_GetNextReconEvent(reconDesc); 764 status = ProcessReconEvent(raidPtr, event); 765 766 if (status == RF_RECON_WRITE_ERROR) { 767 raidPtr->reconControl->error = 1; 768 /* an error was encountered at the very end... bail. 769 This will be very bad news for the user, since 770 at this point there will have been a read error 771 on one component, and a write error on another! 772 */ 773 break; 774 } 775 } 776 } 777 778 779 /* cleanup */ 780 781 /* drain the event queue - after waiting for the writes above, 782 there shouldn't be much (if anything!) left in the queue. */ 783 784 rf_DrainReconEventQueue(reconDesc); 785 786 /* XXX As much as we'd like to free the recon control structure 787 and the reconDesc, we have no way of knowing if/when those will 788 be touched by IO that has yet to occur. It is rather poor to be 789 basically causing a 'memory leak' here, but there doesn't seem to be 790 a cleaner alternative at this time. Perhaps when the reconstruct code 791 gets a makeover this problem will go away. 792 */ 793 #if 0 794 rf_FreeReconControl(raidPtr); 795 #endif 796 797 #if RF_ACC_TRACE > 0 798 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); 799 #endif 800 /* XXX see comment above */ 801 #if 0 802 FreeReconDesc(reconDesc); 803 #endif 804 805 return (1); 806 } 807 808 /* Success: mark the dead disk as reconstructed. We quiesce 809 * the array here to assure no nasty interactions with pending 810 * user accesses when we free up the psstatus structure as 811 * part of FreeReconControl() */ 812 813 rf_SuspendNewRequestsAndWait(raidPtr); 814 815 RF_LOCK_MUTEX(raidPtr->mutex); 816 raidPtr->numFailures--; 817 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE); 818 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared; 819 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal; 820 RF_UNLOCK_MUTEX(raidPtr->mutex); 821 RF_GETTIME(etime); 822 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd); 823 824 rf_ResumeNewRequests(raidPtr); 825 826 printf("raid%d: Reconstruction of disk at col %d completed\n", 827 raidPtr->raidid, col); 828 xor_s = raidPtr->accumXorTimeUs / 1000000; 829 xor_resid_us = raidPtr->accumXorTimeUs % 1000000; 830 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n", 831 raidPtr->raidid, 832 (int) elpsd.tv_sec, (int) elpsd.tv_usec, 833 raidPtr->accumXorTimeUs, xor_s, xor_resid_us); 834 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n", 835 raidPtr->raidid, 836 (int) raidPtr->reconControl->starttime.tv_sec, 837 (int) raidPtr->reconControl->starttime.tv_usec, 838 (int) etime.tv_sec, (int) etime.tv_usec); 839 #if RF_RECON_STATS > 0 840 printf("raid%d: Total head-sep stall count was %d\n", 841 raidPtr->raidid, (int) reconDesc->hsStallCount); 842 #endif /* RF_RECON_STATS > 0 */ 843 rf_FreeReconControl(raidPtr); 844 #if RF_ACC_TRACE > 0 845 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); 846 #endif 847 FreeReconDesc(reconDesc); 848 849 return (0); 850 851 } 852 /***************************************************************************** 853 * do the right thing upon each reconstruction event. 854 *****************************************************************************/ 855 static int 856 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event) 857 { 858 int retcode = 0, submitblocked; 859 RF_ReconBuffer_t *rbuf; 860 RF_SectorCount_t sectorsPerRU; 861 862 retcode = RF_RECON_READ_STOPPED; 863 864 Dprintf1("RECON: ProcessReconEvent type %d\n", event->type); 865 switch (event->type) { 866 867 /* a read I/O has completed */ 868 case RF_REVENT_READDONE: 869 rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf; 870 Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n", 871 event->col, rbuf->parityStripeID); 872 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n", 873 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 874 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 875 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 876 if (!raidPtr->reconControl->error) { 877 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0); 878 Dprintf1("RECON: submitblocked=%d\n", submitblocked); 879 if (!submitblocked) 880 retcode = IssueNextReadRequest(raidPtr, event->col); 881 else 882 retcode = 0; 883 } 884 break; 885 886 /* a write I/O has completed */ 887 case RF_REVENT_WRITEDONE: 888 #if RF_DEBUG_RECON 889 if (rf_floatingRbufDebug) { 890 rf_CheckFloatingRbufCount(raidPtr, 1); 891 } 892 #endif 893 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; 894 rbuf = (RF_ReconBuffer_t *) event->arg; 895 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 896 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n", 897 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete); 898 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap, 899 rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1); 900 rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru); 901 902 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 903 raidPtr->reconControl->pending_writes--; 904 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 905 906 if (rbuf->type == RF_RBUF_TYPE_FLOATING) { 907 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 908 while(raidPtr->reconControl->rb_lock) { 909 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0, 910 &raidPtr->reconControl->rb_mutex); 911 } 912 raidPtr->reconControl->rb_lock = 1; 913 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 914 915 raidPtr->numFullReconBuffers--; 916 rf_ReleaseFloatingReconBuffer(raidPtr, rbuf); 917 918 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 919 raidPtr->reconControl->rb_lock = 0; 920 wakeup(&raidPtr->reconControl->rb_lock); 921 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 922 } else 923 if (rbuf->type == RF_RBUF_TYPE_FORCED) 924 rf_FreeReconBuffer(rbuf); 925 else 926 RF_ASSERT(0); 927 retcode = 0; 928 break; 929 930 case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been 931 * cleared */ 932 Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col); 933 if (!raidPtr->reconControl->error) { 934 submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf, 935 0, (int) (long) event->arg); 936 RF_ASSERT(!submitblocked); /* we wouldn't have gotten the 937 * BUFCLEAR event if we 938 * couldn't submit */ 939 retcode = IssueNextReadRequest(raidPtr, event->col); 940 } 941 break; 942 943 case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction 944 * blockage has been cleared */ 945 DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col); 946 if (!raidPtr->reconControl->error) { 947 retcode = TryToRead(raidPtr, event->col); 948 } 949 break; 950 951 case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation 952 * reconstruction blockage has been 953 * cleared */ 954 Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col); 955 if (!raidPtr->reconControl->error) { 956 retcode = TryToRead(raidPtr, event->col); 957 } 958 break; 959 960 /* a buffer has become ready to write */ 961 case RF_REVENT_BUFREADY: 962 Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col); 963 if (!raidPtr->reconControl->error) { 964 retcode = IssueNextWriteRequest(raidPtr); 965 #if RF_DEBUG_RECON 966 if (rf_floatingRbufDebug) { 967 rf_CheckFloatingRbufCount(raidPtr, 1); 968 } 969 #endif 970 } 971 break; 972 973 /* we need to skip the current RU entirely because it got 974 * recon'd while we were waiting for something else to happen */ 975 case RF_REVENT_SKIP: 976 DDprintf1("RECON: SKIP EVENT: col %d\n", event->col); 977 if (!raidPtr->reconControl->error) { 978 retcode = IssueNextReadRequest(raidPtr, event->col); 979 } 980 break; 981 982 /* a forced-reconstruction read access has completed. Just 983 * submit the buffer */ 984 case RF_REVENT_FORCEDREADDONE: 985 rbuf = (RF_ReconBuffer_t *) event->arg; 986 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 987 DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col); 988 if (!raidPtr->reconControl->error) { 989 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0); 990 RF_ASSERT(!submitblocked); 991 } 992 break; 993 994 /* A read I/O failed to complete */ 995 case RF_REVENT_READ_FAILED: 996 retcode = RF_RECON_READ_ERROR; 997 break; 998 999 /* A write I/O failed to complete */ 1000 case RF_REVENT_WRITE_FAILED: 1001 retcode = RF_RECON_WRITE_ERROR; 1002 1003 rbuf = (RF_ReconBuffer_t *) event->arg; 1004 1005 /* cleanup the disk queue data */ 1006 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 1007 1008 /* At this point we're erroring out, badly, and floatingRbufs 1009 may not even be valid. Rather than putting this back onto 1010 the floatingRbufs list, just arrange for its immediate 1011 destruction. 1012 */ 1013 rf_FreeReconBuffer(rbuf); 1014 break; 1015 1016 /* a forced read I/O failed to complete */ 1017 case RF_REVENT_FORCEDREAD_FAILED: 1018 retcode = RF_RECON_READ_ERROR; 1019 break; 1020 1021 default: 1022 RF_PANIC(); 1023 } 1024 rf_FreeReconEventDesc(event); 1025 return (retcode); 1026 } 1027 /***************************************************************************** 1028 * 1029 * find the next thing that's needed on the indicated disk, and issue 1030 * a read request for it. We assume that the reconstruction buffer 1031 * associated with this process is free to receive the data. If 1032 * reconstruction is blocked on the indicated RU, we issue a 1033 * blockage-release request instead of a physical disk read request. 1034 * If the current disk gets too far ahead of the others, we issue a 1035 * head-separation wait request and return. 1036 * 1037 * ctrl->{ru_count, curPSID, diskOffset} and 1038 * rbuf->failedDiskSectorOffset are maintained to point to the unit 1039 * we're currently accessing. Note that this deviates from the 1040 * standard C idiom of having counters point to the next thing to be 1041 * accessed. This allows us to easily retry when we're blocked by 1042 * head separation or reconstruction-blockage events. 1043 * 1044 *****************************************************************************/ 1045 static int 1046 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col) 1047 { 1048 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col]; 1049 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1050 RF_ReconBuffer_t *rbuf = ctrl->rbuf; 1051 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; 1052 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1053 int do_new_check = 0, retcode = 0, status; 1054 1055 /* if we are currently the slowest disk, mark that we have to do a new 1056 * check */ 1057 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter) 1058 do_new_check = 1; 1059 1060 while (1) { 1061 1062 ctrl->ru_count++; 1063 if (ctrl->ru_count < RUsPerPU) { 1064 ctrl->diskOffset += sectorsPerRU; 1065 rbuf->failedDiskSectorOffset += sectorsPerRU; 1066 } else { 1067 ctrl->curPSID++; 1068 ctrl->ru_count = 0; 1069 /* code left over from when head-sep was based on 1070 * parity stripe id */ 1071 if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) { 1072 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter)); 1073 return (RF_RECON_DONE_READS); /* finito! */ 1074 } 1075 /* find the disk offsets of the start of the parity 1076 * stripe on both the current disk and the failed 1077 * disk. skip this entire parity stripe if either disk 1078 * does not appear in the indicated PS */ 1079 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset, 1080 &rbuf->spCol, &rbuf->spOffset); 1081 if (status) { 1082 ctrl->ru_count = RUsPerPU - 1; 1083 continue; 1084 } 1085 } 1086 rbuf->which_ru = ctrl->ru_count; 1087 1088 /* skip this RU if it's already been reconstructed */ 1089 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) { 1090 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count); 1091 continue; 1092 } 1093 break; 1094 } 1095 ctrl->headSepCounter++; 1096 if (do_new_check) 1097 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */ 1098 1099 1100 /* at this point, we have definitely decided what to do, and we have 1101 * only to see if we can actually do it now */ 1102 rbuf->parityStripeID = ctrl->curPSID; 1103 rbuf->which_ru = ctrl->ru_count; 1104 #if RF_ACC_TRACE > 0 1105 memset((char *) &raidPtr->recon_tracerecs[col], 0, 1106 sizeof(raidPtr->recon_tracerecs[col])); 1107 raidPtr->recon_tracerecs[col].reconacc = 1; 1108 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); 1109 #endif 1110 retcode = TryToRead(raidPtr, col); 1111 return (retcode); 1112 } 1113 1114 /* 1115 * tries to issue the next read on the indicated disk. We may be 1116 * blocked by (a) the heads being too far apart, or (b) recon on the 1117 * indicated RU being blocked due to a write by a user thread. In 1118 * this case, we issue a head-sep or blockage wait request, which will 1119 * cause this same routine to be invoked again later when the blockage 1120 * has cleared. 1121 */ 1122 1123 static int 1124 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col) 1125 { 1126 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col]; 1127 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; 1128 RF_StripeNum_t psid = ctrl->curPSID; 1129 RF_ReconUnitNum_t which_ru = ctrl->ru_count; 1130 RF_DiskQueueData_t *req; 1131 int status; 1132 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; 1133 1134 /* if the current disk is too far ahead of the others, issue a 1135 * head-separation wait and return */ 1136 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru)) 1137 return (0); 1138 1139 /* allocate a new PSS in case we need it */ 1140 newpssPtr = rf_AllocPSStatus(raidPtr); 1141 1142 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1143 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr); 1144 1145 if (pssPtr != newpssPtr) { 1146 rf_FreePSStatus(raidPtr, newpssPtr); 1147 } 1148 1149 /* if recon is blocked on the indicated parity stripe, issue a 1150 * block-wait request and return. this also must mark the indicated RU 1151 * in the stripe as under reconstruction if not blocked. */ 1152 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru); 1153 if (status == RF_PSS_RECON_BLOCKED) { 1154 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru); 1155 goto out; 1156 } else 1157 if (status == RF_PSS_FORCED_ON_WRITE) { 1158 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP); 1159 goto out; 1160 } 1161 /* make one last check to be sure that the indicated RU didn't get 1162 * reconstructed while we were waiting for something else to happen. 1163 * This is unfortunate in that it causes us to make this check twice 1164 * in the normal case. Might want to make some attempt to re-work 1165 * this so that we only do this check if we've definitely blocked on 1166 * one of the above checks. When this condition is detected, we may 1167 * have just created a bogus status entry, which we need to delete. */ 1168 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) { 1169 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru); 1170 if (pssPtr == newpssPtr) 1171 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1172 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP); 1173 goto out; 1174 } 1175 /* found something to read. issue the I/O */ 1176 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n", 1177 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer); 1178 #if RF_ACC_TRACE > 0 1179 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer); 1180 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer); 1181 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us = 1182 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer); 1183 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); 1184 #endif 1185 /* should be ok to use a NULL proc pointer here, all the bufs we use 1186 * should be in kernel space */ 1187 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru, 1188 ReconReadDoneProc, (void *) ctrl, 1189 #if RF_ACC_TRACE > 0 1190 &raidPtr->recon_tracerecs[col], 1191 #else 1192 NULL, 1193 #endif 1194 (void *) raidPtr, 0, NULL, PR_WAITOK); 1195 1196 ctrl->rbuf->arg = (void *) req; 1197 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY); 1198 pssPtr->issued[col] = 1; 1199 1200 out: 1201 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1202 return (0); 1203 } 1204 1205 1206 /* 1207 * given a parity stripe ID, we want to find out whether both the 1208 * current disk and the failed disk exist in that parity stripe. If 1209 * not, we want to skip this whole PS. If so, we want to find the 1210 * disk offset of the start of the PS on both the current disk and the 1211 * failed disk. 1212 * 1213 * this works by getting a list of disks comprising the indicated 1214 * parity stripe, and searching the list for the current and failed 1215 * disks. Once we've decided they both exist in the parity stripe, we 1216 * need to decide whether each is data or parity, so that we'll know 1217 * which mapping function to call to get the corresponding disk 1218 * offsets. 1219 * 1220 * this is kind of unpleasant, but doing it this way allows the 1221 * reconstruction code to use parity stripe IDs rather than physical 1222 * disks address to march through the failed disk, which greatly 1223 * simplifies a lot of code, as well as eliminating the need for a 1224 * reverse-mapping function. I also think it will execute faster, 1225 * since the calls to the mapping module are kept to a minimum. 1226 * 1227 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING 1228 * THE STRIPE IN THE CORRECT ORDER 1229 * 1230 * raidPtr - raid descriptor 1231 * psid - parity stripe identifier 1232 * col - column of disk to find the offsets for 1233 * spCol - out: col of spare unit for failed unit 1234 * spOffset - out: offset into disk containing spare unit 1235 * 1236 */ 1237 1238 1239 static int 1240 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid, 1241 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset, 1242 RF_SectorNum_t *outFailedDiskSectorOffset, 1243 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset) 1244 { 1245 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1246 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1247 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ 1248 RF_RowCol_t *diskids; 1249 u_int i, j, k, i_offset, j_offset; 1250 RF_RowCol_t pcol; 1251 int testcol; 1252 RF_SectorNum_t poffset; 1253 char i_is_parity = 0, j_is_parity = 0; 1254 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; 1255 1256 /* get a listing of the disks comprising that stripe */ 1257 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); 1258 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids); 1259 RF_ASSERT(diskids); 1260 1261 /* reject this entire parity stripe if it does not contain the 1262 * indicated disk or it does not contain the failed disk */ 1263 1264 for (i = 0; i < stripeWidth; i++) { 1265 if (col == diskids[i]) 1266 break; 1267 } 1268 if (i == stripeWidth) 1269 goto skipit; 1270 for (j = 0; j < stripeWidth; j++) { 1271 if (fcol == diskids[j]) 1272 break; 1273 } 1274 if (j == stripeWidth) { 1275 goto skipit; 1276 } 1277 /* find out which disk the parity is on */ 1278 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP); 1279 1280 /* find out if either the current RU or the failed RU is parity */ 1281 /* also, if the parity occurs in this stripe prior to the data and/or 1282 * failed col, we need to decrement i and/or j */ 1283 for (k = 0; k < stripeWidth; k++) 1284 if (diskids[k] == pcol) 1285 break; 1286 RF_ASSERT(k < stripeWidth); 1287 i_offset = i; 1288 j_offset = j; 1289 if (k < i) 1290 i_offset--; 1291 else 1292 if (k == i) { 1293 i_is_parity = 1; 1294 i_offset = 0; 1295 } /* set offsets to zero to disable multiply 1296 * below */ 1297 if (k < j) 1298 j_offset--; 1299 else 1300 if (k == j) { 1301 j_is_parity = 1; 1302 j_offset = 0; 1303 } 1304 /* at this point, [ij]_is_parity tells us whether the [current,failed] 1305 * disk is parity at the start of this RU, and, if data, "[ij]_offset" 1306 * tells us how far into the stripe the [current,failed] disk is. */ 1307 1308 /* call the mapping routine to get the offset into the current disk, 1309 * repeat for failed disk. */ 1310 if (i_is_parity) 1311 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1312 else 1313 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1314 1315 RF_ASSERT(col == testcol); 1316 1317 if (j_is_parity) 1318 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1319 else 1320 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1321 RF_ASSERT(fcol == testcol); 1322 1323 /* now locate the spare unit for the failed unit */ 1324 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1325 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { 1326 if (j_is_parity) 1327 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1328 else 1329 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1330 } else { 1331 #endif 1332 *spCol = raidPtr->reconControl->spareCol; 1333 *spOffset = *outFailedDiskSectorOffset; 1334 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1335 } 1336 #endif 1337 return (0); 1338 1339 skipit: 1340 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n", 1341 psid, col); 1342 return (1); 1343 } 1344 /* this is called when a buffer has become ready to write to the replacement disk */ 1345 static int 1346 IssueNextWriteRequest(RF_Raid_t *raidPtr) 1347 { 1348 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1349 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1350 #if RF_ACC_TRACE > 0 1351 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1352 #endif 1353 RF_ReconBuffer_t *rbuf; 1354 RF_DiskQueueData_t *req; 1355 1356 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl); 1357 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't 1358 * have gotten the event that sent us here */ 1359 RF_ASSERT(rbuf->pssPtr); 1360 1361 rbuf->pssPtr->writeRbuf = rbuf; 1362 rbuf->pssPtr = NULL; 1363 1364 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", 1365 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, 1366 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); 1367 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", 1368 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 1369 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 1370 1371 /* should be ok to use a NULL b_proc here b/c all addrs should be in 1372 * kernel space */ 1373 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, 1374 sectorsPerRU, rbuf->buffer, 1375 rbuf->parityStripeID, rbuf->which_ru, 1376 ReconWriteDoneProc, (void *) rbuf, 1377 #if RF_ACC_TRACE > 0 1378 &raidPtr->recon_tracerecs[fcol], 1379 #else 1380 NULL, 1381 #endif 1382 (void *) raidPtr, 0, NULL, PR_WAITOK); 1383 1384 rbuf->arg = (void *) req; 1385 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 1386 raidPtr->reconControl->pending_writes++; 1387 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 1388 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY); 1389 1390 return (0); 1391 } 1392 1393 /* 1394 * this gets called upon the completion of a reconstruction read 1395 * operation the arg is a pointer to the per-disk reconstruction 1396 * control structure for the process that just finished a read. 1397 * 1398 * called at interrupt context in the kernel, so don't do anything 1399 * illegal here. 1400 */ 1401 static int 1402 ReconReadDoneProc(void *arg, int status) 1403 { 1404 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; 1405 RF_Raid_t *raidPtr; 1406 1407 /* Detect that reconCtrl is no longer valid, and if that 1408 is the case, bail without calling rf_CauseReconEvent(). 1409 There won't be anyone listening for this event anyway */ 1410 1411 if (ctrl->reconCtrl == NULL) 1412 return(0); 1413 1414 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; 1415 1416 if (status) { 1417 printf("raid%d: Recon read failed!\n", raidPtr->raidid); 1418 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED); 1419 return(0); 1420 } 1421 #if RF_ACC_TRACE > 0 1422 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1423 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1424 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = 1425 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1426 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1427 #endif 1428 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE); 1429 return (0); 1430 } 1431 /* this gets called upon the completion of a reconstruction write operation. 1432 * the arg is a pointer to the rbuf that was just written 1433 * 1434 * called at interrupt context in the kernel, so don't do anything illegal here. 1435 */ 1436 static int 1437 ReconWriteDoneProc(void *arg, int status) 1438 { 1439 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; 1440 1441 /* Detect that reconControl is no longer valid, and if that 1442 is the case, bail without calling rf_CauseReconEvent(). 1443 There won't be anyone listening for this event anyway */ 1444 1445 if (rbuf->raidPtr->reconControl == NULL) 1446 return(0); 1447 1448 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru); 1449 if (status) { 1450 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid); 1451 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED); 1452 return(0); 1453 } 1454 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE); 1455 return (0); 1456 } 1457 1458 1459 /* 1460 * computes a new minimum head sep, and wakes up anyone who needs to 1461 * be woken as a result 1462 */ 1463 static void 1464 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr) 1465 { 1466 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; 1467 RF_HeadSepLimit_t new_min; 1468 RF_RowCol_t i; 1469 RF_CallbackDesc_t *p; 1470 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition 1471 * of a minimum */ 1472 1473 1474 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1475 while(reconCtrlPtr->rb_lock) { 1476 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex); 1477 } 1478 reconCtrlPtr->rb_lock = 1; 1479 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1480 1481 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */ 1482 for (i = 0; i < raidPtr->numCol; i++) 1483 if (i != reconCtrlPtr->fcol) { 1484 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) 1485 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; 1486 } 1487 /* set the new minimum and wake up anyone who can now run again */ 1488 if (new_min != reconCtrlPtr->minHeadSepCounter) { 1489 reconCtrlPtr->minHeadSepCounter = new_min; 1490 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min); 1491 while (reconCtrlPtr->headSepCBList) { 1492 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) 1493 break; 1494 p = reconCtrlPtr->headSepCBList; 1495 reconCtrlPtr->headSepCBList = p->next; 1496 p->next = NULL; 1497 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); 1498 rf_FreeCallbackDesc(p); 1499 } 1500 1501 } 1502 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1503 reconCtrlPtr->rb_lock = 0; 1504 wakeup(&reconCtrlPtr->rb_lock); 1505 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1506 } 1507 1508 /* 1509 * checks to see that the maximum head separation will not be violated 1510 * if we initiate a reconstruction I/O on the indicated disk. 1511 * Limiting the maximum head separation between two disks eliminates 1512 * the nasty buffer-stall conditions that occur when one disk races 1513 * ahead of the others and consumes all of the floating recon buffers. 1514 * This code is complex and unpleasant but it's necessary to avoid 1515 * some very nasty, albeit fairly rare, reconstruction behavior. 1516 * 1517 * returns non-zero if and only if we have to stop working on the 1518 * indicated disk due to a head-separation delay. 1519 */ 1520 static int 1521 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl, 1522 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, 1523 RF_ReconUnitNum_t which_ru) 1524 { 1525 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; 1526 RF_CallbackDesc_t *cb, *p, *pt; 1527 int retval = 0; 1528 1529 /* if we're too far ahead of the slowest disk, stop working on this 1530 * disk until the slower ones catch up. We do this by scheduling a 1531 * wakeup callback for the time when the slowest disk has caught up. 1532 * We define "caught up" with 20% hysteresis, i.e. the head separation 1533 * must have fallen to at most 80% of the max allowable head 1534 * separation before we'll wake up. 1535 * 1536 */ 1537 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1538 while(reconCtrlPtr->rb_lock) { 1539 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex); 1540 } 1541 reconCtrlPtr->rb_lock = 1; 1542 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1543 if ((raidPtr->headSepLimit >= 0) && 1544 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) { 1545 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n", 1546 raidPtr->raidid, col, ctrl->headSepCounter, 1547 reconCtrlPtr->minHeadSepCounter, 1548 raidPtr->headSepLimit); 1549 cb = rf_AllocCallbackDesc(); 1550 /* the minHeadSepCounter value we have to get to before we'll 1551 * wake up. build in 20% hysteresis. */ 1552 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5); 1553 cb->col = col; 1554 cb->next = NULL; 1555 1556 /* insert this callback descriptor into the sorted list of 1557 * pending head-sep callbacks */ 1558 p = reconCtrlPtr->headSepCBList; 1559 if (!p) 1560 reconCtrlPtr->headSepCBList = cb; 1561 else 1562 if (cb->callbackArg.v < p->callbackArg.v) { 1563 cb->next = reconCtrlPtr->headSepCBList; 1564 reconCtrlPtr->headSepCBList = cb; 1565 } else { 1566 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next); 1567 cb->next = p; 1568 pt->next = cb; 1569 } 1570 retval = 1; 1571 #if RF_RECON_STATS > 0 1572 ctrl->reconCtrl->reconDesc->hsStallCount++; 1573 #endif /* RF_RECON_STATS > 0 */ 1574 } 1575 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1576 reconCtrlPtr->rb_lock = 0; 1577 wakeup(&reconCtrlPtr->rb_lock); 1578 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1579 1580 return (retval); 1581 } 1582 /* 1583 * checks to see if reconstruction has been either forced or blocked 1584 * by a user operation. if forced, we skip this RU entirely. else if 1585 * blocked, put ourselves on the wait list. else return 0. 1586 * 1587 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY 1588 */ 1589 static int 1590 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, 1591 RF_ReconParityStripeStatus_t *pssPtr, 1592 RF_PerDiskReconCtrl_t *ctrl, 1593 RF_RowCol_t col, 1594 RF_StripeNum_t psid, 1595 RF_ReconUnitNum_t which_ru) 1596 { 1597 RF_CallbackDesc_t *cb; 1598 int retcode = 0; 1599 1600 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) 1601 retcode = RF_PSS_FORCED_ON_WRITE; 1602 else 1603 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { 1604 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru); 1605 cb = rf_AllocCallbackDesc(); /* append ourselves to 1606 * the blockage-wait 1607 * list */ 1608 cb->col = col; 1609 cb->next = pssPtr->blockWaitList; 1610 pssPtr->blockWaitList = cb; 1611 retcode = RF_PSS_RECON_BLOCKED; 1612 } 1613 if (!retcode) 1614 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under 1615 * reconstruction */ 1616 1617 return (retcode); 1618 } 1619 /* 1620 * if reconstruction is currently ongoing for the indicated stripeID, 1621 * reconstruction is forced to completion and we return non-zero to 1622 * indicate that the caller must wait. If not, then reconstruction is 1623 * blocked on the indicated stripe and the routine returns zero. If 1624 * and only if we return non-zero, we'll cause the cbFunc to get 1625 * invoked with the cbArg when the reconstruction has completed. 1626 */ 1627 int 1628 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 1629 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg) 1630 { 1631 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're 1632 * forcing recon on */ 1633 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ 1634 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity 1635 * stripe status structure */ 1636 RF_StripeNum_t psid; /* parity stripe id */ 1637 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk 1638 * offset */ 1639 RF_RowCol_t *diskids; 1640 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ 1641 RF_RowCol_t fcol, diskno, i; 1642 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ 1643 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */ 1644 RF_CallbackDesc_t *cb; 1645 int nPromoted; 1646 1647 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1648 1649 /* allocate a new PSS in case we need it */ 1650 newpssPtr = rf_AllocPSStatus(raidPtr); 1651 1652 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1653 1654 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr); 1655 1656 if (pssPtr != newpssPtr) { 1657 rf_FreePSStatus(raidPtr, newpssPtr); 1658 } 1659 1660 /* if recon is not ongoing on this PS, just return */ 1661 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1662 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1663 return (0); 1664 } 1665 /* otherwise, we have to wait for reconstruction to complete on this 1666 * RU. */ 1667 /* In order to avoid waiting for a potentially large number of 1668 * low-priority accesses to complete, we force a normal-priority (i.e. 1669 * not low-priority) reconstruction on this RU. */ 1670 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { 1671 DDprintf1("Forcing recon on psid %ld\n", psid); 1672 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under 1673 * forced recon */ 1674 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage 1675 * that we just set */ 1676 fcol = raidPtr->reconControl->fcol; 1677 1678 /* get a listing of the disks comprising the indicated stripe */ 1679 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids); 1680 1681 /* For previously issued reads, elevate them to normal 1682 * priority. If the I/O has already completed, it won't be 1683 * found in the queue, and hence this will be a no-op. For 1684 * unissued reads, allocate buffers and issue new reads. The 1685 * fact that we've set the FORCED bit means that the regular 1686 * recon procs will not re-issue these reqs */ 1687 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++) 1688 if ((diskno = diskids[i]) != fcol) { 1689 if (pssPtr->issued[diskno]) { 1690 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru); 1691 if (rf_reconDebug && nPromoted) 1692 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno); 1693 } else { 1694 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ 1695 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset, 1696 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare 1697 * location */ 1698 new_rbuf->parityStripeID = psid; /* fill in the buffer */ 1699 new_rbuf->which_ru = which_ru; 1700 new_rbuf->failedDiskSectorOffset = fd_offset; 1701 new_rbuf->priority = RF_IO_NORMAL_PRIORITY; 1702 1703 /* use NULL b_proc b/c all addrs 1704 * should be in kernel space */ 1705 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, 1706 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf, 1707 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK); 1708 1709 new_rbuf->arg = req; 1710 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ 1711 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno); 1712 } 1713 } 1714 /* if the write is sitting in the disk queue, elevate its 1715 * priority */ 1716 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru)) 1717 printf("raid%d: promoted write to col %d\n", 1718 raidPtr->raidid, fcol); 1719 } 1720 /* install a callback descriptor to be invoked when recon completes on 1721 * this parity stripe. */ 1722 cb = rf_AllocCallbackDesc(); 1723 /* XXX the following is bogus.. These functions don't really match!! 1724 * GO */ 1725 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc; 1726 cb->callbackArg.p = (void *) cbArg; 1727 cb->next = pssPtr->procWaitList; 1728 pssPtr->procWaitList = cb; 1729 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n", 1730 raidPtr->raidid, psid); 1731 1732 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1733 return (1); 1734 } 1735 /* called upon the completion of a forced reconstruction read. 1736 * all we do is schedule the FORCEDREADONE event. 1737 * called at interrupt context in the kernel, so don't do anything illegal here. 1738 */ 1739 static void 1740 ForceReconReadDoneProc(void *arg, int status) 1741 { 1742 RF_ReconBuffer_t *rbuf = arg; 1743 1744 /* Detect that reconControl is no longer valid, and if that 1745 is the case, bail without calling rf_CauseReconEvent(). 1746 There won't be anyone listening for this event anyway */ 1747 1748 if (rbuf->raidPtr->reconControl == NULL) 1749 return; 1750 1751 if (status) { 1752 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid); 1753 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED); 1754 return; 1755 } 1756 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); 1757 } 1758 /* releases a block on the reconstruction of the indicated stripe */ 1759 int 1760 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) 1761 { 1762 RF_StripeNum_t stripeID = asmap->stripeID; 1763 RF_ReconParityStripeStatus_t *pssPtr; 1764 RF_ReconUnitNum_t which_ru; 1765 RF_StripeNum_t psid; 1766 RF_CallbackDesc_t *cb; 1767 1768 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1769 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1770 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL); 1771 1772 /* When recon is forced, the pss desc can get deleted before we get 1773 * back to unblock recon. But, this can _only_ happen when recon is 1774 * forced. It would be good to put some kind of sanity check here, but 1775 * how to decide if recon was just forced or not? */ 1776 if (!pssPtr) { 1777 /* printf("Warning: no pss descriptor upon unblock on psid %ld 1778 * RU %d\n",psid,which_ru); */ 1779 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0) 1780 if (rf_reconDebug || rf_pssDebug) 1781 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru); 1782 #endif 1783 goto out; 1784 } 1785 pssPtr->blockCount--; 1786 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n", 1787 raidPtr->raidid, psid, pssPtr->blockCount); 1788 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ 1789 1790 /* unblock recon before calling CauseReconEvent in case 1791 * CauseReconEvent causes us to try to issue a new read before 1792 * returning here. */ 1793 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; 1794 1795 1796 while (pssPtr->blockWaitList) { 1797 /* spin through the block-wait list and 1798 release all the waiters */ 1799 cb = pssPtr->blockWaitList; 1800 pssPtr->blockWaitList = cb->next; 1801 cb->next = NULL; 1802 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR); 1803 rf_FreeCallbackDesc(cb); 1804 } 1805 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1806 /* if no recon was requested while recon was blocked */ 1807 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1808 } 1809 } 1810 out: 1811 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1812 return (0); 1813 } 1814