1 /* $NetBSD: rf_reconstruct.c,v 1.103 2008/04/15 16:05:43 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /************************************************************ 30 * 31 * rf_reconstruct.c -- code to perform on-line reconstruction 32 * 33 ************************************************************/ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.103 2008/04/15 16:05:43 oster Exp $"); 37 38 #include <sys/param.h> 39 #include <sys/time.h> 40 #include <sys/buf.h> 41 #include <sys/errno.h> 42 #include <sys/systm.h> 43 #include <sys/proc.h> 44 #include <sys/ioctl.h> 45 #include <sys/fcntl.h> 46 #include <sys/vnode.h> 47 #include <dev/raidframe/raidframevar.h> 48 49 #include "rf_raid.h" 50 #include "rf_reconutil.h" 51 #include "rf_revent.h" 52 #include "rf_reconbuffer.h" 53 #include "rf_acctrace.h" 54 #include "rf_etimer.h" 55 #include "rf_dag.h" 56 #include "rf_desc.h" 57 #include "rf_debugprint.h" 58 #include "rf_general.h" 59 #include "rf_driver.h" 60 #include "rf_utils.h" 61 #include "rf_shutdown.h" 62 63 #include "rf_kintf.h" 64 65 /* setting these to -1 causes them to be set to their default values if not set by debug options */ 66 67 #if RF_DEBUG_RECON 68 #define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) 69 #define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 70 #define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 71 #define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) 72 #define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL) 73 #define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) 74 #define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL) 75 #define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) 76 77 #define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 78 #define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 79 80 #else /* RF_DEBUG_RECON */ 81 82 #define Dprintf(s) {} 83 #define Dprintf1(s,a) {} 84 #define Dprintf2(s,a,b) {} 85 #define Dprintf3(s,a,b,c) {} 86 #define Dprintf4(s,a,b,c,d) {} 87 #define Dprintf5(s,a,b,c,d,e) {} 88 #define Dprintf6(s,a,b,c,d,e,f) {} 89 #define Dprintf7(s,a,b,c,d,e,f,g) {} 90 91 #define DDprintf1(s,a) {} 92 #define DDprintf2(s,a,b) {} 93 94 #endif /* RF_DEBUG_RECON */ 95 96 #define RF_RECON_DONE_READS 1 97 #define RF_RECON_READ_ERROR 2 98 #define RF_RECON_WRITE_ERROR 3 99 #define RF_RECON_READ_STOPPED 4 100 101 #define RF_MAX_FREE_RECONBUFFER 32 102 #define RF_MIN_FREE_RECONBUFFER 16 103 104 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t, 105 RF_RaidDisk_t *, int, RF_RowCol_t); 106 static void FreeReconDesc(RF_RaidReconDesc_t *); 107 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *); 108 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t); 109 static int TryToRead(RF_Raid_t *, RF_RowCol_t); 110 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t, 111 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *, 112 RF_SectorNum_t *); 113 static int IssueNextWriteRequest(RF_Raid_t *); 114 static int ReconReadDoneProc(void *, int); 115 static int ReconWriteDoneProc(void *, int); 116 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t); 117 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *, 118 RF_RowCol_t, RF_HeadSepLimit_t, 119 RF_ReconUnitNum_t); 120 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *, 121 RF_ReconParityStripeStatus_t *, 122 RF_PerDiskReconCtrl_t *, 123 RF_RowCol_t, RF_StripeNum_t, 124 RF_ReconUnitNum_t); 125 static void ForceReconReadDoneProc(void *, int); 126 static void rf_ShutdownReconstruction(void *); 127 128 struct RF_ReconDoneProc_s { 129 void (*proc) (RF_Raid_t *, void *); 130 void *arg; 131 RF_ReconDoneProc_t *next; 132 }; 133 134 /************************************************************************** 135 * 136 * sets up the parameters that will be used by the reconstruction process 137 * currently there are none, except for those that the layout-specific 138 * configuration (e.g. rf_ConfigureDeclustered) routine sets up. 139 * 140 * in the kernel, we fire off the recon thread. 141 * 142 **************************************************************************/ 143 static void 144 rf_ShutdownReconstruction(void *ignored) 145 { 146 pool_destroy(&rf_pools.reconbuffer); 147 } 148 149 int 150 rf_ConfigureReconstruction(RF_ShutdownList_t **listp) 151 { 152 153 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t), 154 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER); 155 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL); 156 157 return (0); 158 } 159 160 static RF_RaidReconDesc_t * 161 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col, 162 RF_RaidDisk_t *spareDiskPtr, int numDisksDone, 163 RF_RowCol_t scol) 164 { 165 166 RF_RaidReconDesc_t *reconDesc; 167 168 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t), 169 (RF_RaidReconDesc_t *)); 170 reconDesc->raidPtr = raidPtr; 171 reconDesc->col = col; 172 reconDesc->spareDiskPtr = spareDiskPtr; 173 reconDesc->numDisksDone = numDisksDone; 174 reconDesc->scol = scol; 175 reconDesc->next = NULL; 176 177 return (reconDesc); 178 } 179 180 static void 181 FreeReconDesc(RF_RaidReconDesc_t *reconDesc) 182 { 183 #if RF_RECON_STATS > 0 184 printf("raid%d: %lu recon event waits, %lu recon delays\n", 185 reconDesc->raidPtr->raidid, 186 (long) reconDesc->numReconEventWaits, 187 (long) reconDesc->numReconExecDelays); 188 #endif /* RF_RECON_STATS > 0 */ 189 printf("raid%d: %lu max exec ticks\n", 190 reconDesc->raidPtr->raidid, 191 (long) reconDesc->maxReconExecTicks); 192 #if (RF_RECON_STATS > 0) || defined(KERNEL) 193 printf("\n"); 194 #endif /* (RF_RECON_STATS > 0) || KERNEL */ 195 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t)); 196 } 197 198 199 /***************************************************************************** 200 * 201 * primary routine to reconstruct a failed disk. This should be called from 202 * within its own thread. It won't return until reconstruction completes, 203 * fails, or is aborted. 204 *****************************************************************************/ 205 int 206 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col) 207 { 208 const RF_LayoutSW_t *lp; 209 int rc; 210 211 lp = raidPtr->Layout.map; 212 if (lp->SubmitReconBuffer) { 213 /* 214 * The current infrastructure only supports reconstructing one 215 * disk at a time for each array. 216 */ 217 RF_LOCK_MUTEX(raidPtr->mutex); 218 while (raidPtr->reconInProgress) { 219 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); 220 } 221 raidPtr->reconInProgress++; 222 RF_UNLOCK_MUTEX(raidPtr->mutex); 223 rc = rf_ReconstructFailedDiskBasic(raidPtr, col); 224 RF_LOCK_MUTEX(raidPtr->mutex); 225 raidPtr->reconInProgress--; 226 RF_UNLOCK_MUTEX(raidPtr->mutex); 227 } else { 228 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", 229 lp->parityConfig); 230 rc = EIO; 231 } 232 RF_SIGNAL_COND(raidPtr->waitForReconCond); 233 return (rc); 234 } 235 236 int 237 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col) 238 { 239 RF_ComponentLabel_t c_label; 240 RF_RaidDisk_t *spareDiskPtr = NULL; 241 RF_RaidReconDesc_t *reconDesc; 242 RF_RowCol_t scol; 243 int numDisksDone = 0, rc; 244 245 /* first look for a spare drive onto which to reconstruct the data */ 246 /* spare disk descriptors are stored in row 0. This may have to 247 * change eventually */ 248 249 RF_LOCK_MUTEX(raidPtr->mutex); 250 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); 251 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 252 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 253 if (raidPtr->status != rf_rs_degraded) { 254 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col); 255 RF_UNLOCK_MUTEX(raidPtr->mutex); 256 return (EINVAL); 257 } 258 scol = (-1); 259 } else { 260 #endif 261 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) { 262 if (raidPtr->Disks[scol].status == rf_ds_spare) { 263 spareDiskPtr = &raidPtr->Disks[scol]; 264 spareDiskPtr->status = rf_ds_used_spare; 265 break; 266 } 267 } 268 if (!spareDiskPtr) { 269 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col); 270 RF_UNLOCK_MUTEX(raidPtr->mutex); 271 return (ENOSPC); 272 } 273 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol); 274 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 275 } 276 #endif 277 RF_UNLOCK_MUTEX(raidPtr->mutex); 278 279 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol); 280 raidPtr->reconDesc = (void *) reconDesc; 281 #if RF_RECON_STATS > 0 282 reconDesc->hsStallCount = 0; 283 reconDesc->numReconExecDelays = 0; 284 reconDesc->numReconEventWaits = 0; 285 #endif /* RF_RECON_STATS > 0 */ 286 reconDesc->reconExecTimerRunning = 0; 287 reconDesc->reconExecTicks = 0; 288 reconDesc->maxReconExecTicks = 0; 289 rc = rf_ContinueReconstructFailedDisk(reconDesc); 290 291 if (!rc) { 292 /* fix up the component label */ 293 /* Don't actually need the read here.. */ 294 raidread_component_label( 295 raidPtr->raid_cinfo[scol].ci_dev, 296 raidPtr->raid_cinfo[scol].ci_vp, 297 &c_label); 298 299 raid_init_component_label( raidPtr, &c_label); 300 c_label.row = 0; 301 c_label.column = col; 302 c_label.clean = RF_RAID_DIRTY; 303 c_label.status = rf_ds_optimal; 304 c_label.partitionSize = raidPtr->Disks[scol].partitionSize; 305 306 /* We've just done a rebuild based on all the other 307 disks, so at this point the parity is known to be 308 clean, even if it wasn't before. */ 309 310 /* XXX doesn't hold for RAID 6!!*/ 311 312 RF_LOCK_MUTEX(raidPtr->mutex); 313 raidPtr->parity_good = RF_RAID_CLEAN; 314 RF_UNLOCK_MUTEX(raidPtr->mutex); 315 316 /* XXXX MORE NEEDED HERE */ 317 318 raidwrite_component_label( 319 raidPtr->raid_cinfo[scol].ci_dev, 320 raidPtr->raid_cinfo[scol].ci_vp, 321 &c_label); 322 323 } else { 324 /* Reconstruct failed. */ 325 326 RF_LOCK_MUTEX(raidPtr->mutex); 327 /* Failed disk goes back to "failed" status */ 328 raidPtr->Disks[col].status = rf_ds_failed; 329 330 /* Spare disk goes back to "spare" status. */ 331 spareDiskPtr->status = rf_ds_spare; 332 RF_UNLOCK_MUTEX(raidPtr->mutex); 333 334 } 335 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 336 return (rc); 337 } 338 339 /* 340 341 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL, 342 and you don't get a spare until the next Monday. With this function 343 (and hot-swappable drives) you can now put your new disk containing 344 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to 345 rebuild the data "on the spot". 346 347 */ 348 349 int 350 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col) 351 { 352 RF_RaidDisk_t *spareDiskPtr = NULL; 353 RF_RaidReconDesc_t *reconDesc; 354 const RF_LayoutSW_t *lp; 355 RF_ComponentLabel_t c_label; 356 int numDisksDone = 0, rc; 357 struct partinfo dpart; 358 struct vnode *vp; 359 struct vattr va; 360 int retcode; 361 int ac; 362 363 lp = raidPtr->Layout.map; 364 if (!lp->SubmitReconBuffer) { 365 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", 366 lp->parityConfig); 367 /* wakeup anyone who might be waiting to do a reconstruct */ 368 RF_SIGNAL_COND(raidPtr->waitForReconCond); 369 return(EIO); 370 } 371 372 /* 373 * The current infrastructure only supports reconstructing one 374 * disk at a time for each array. 375 */ 376 RF_LOCK_MUTEX(raidPtr->mutex); 377 378 if (raidPtr->Disks[col].status != rf_ds_failed) { 379 /* "It's gone..." */ 380 raidPtr->numFailures++; 381 raidPtr->Disks[col].status = rf_ds_failed; 382 raidPtr->status = rf_rs_degraded; 383 RF_UNLOCK_MUTEX(raidPtr->mutex); 384 rf_update_component_labels(raidPtr, 385 RF_NORMAL_COMPONENT_UPDATE); 386 RF_LOCK_MUTEX(raidPtr->mutex); 387 } 388 389 while (raidPtr->reconInProgress) { 390 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); 391 } 392 393 raidPtr->reconInProgress++; 394 395 /* first look for a spare drive onto which to reconstruct the 396 data. spare disk descriptors are stored in row 0. This 397 may have to change eventually */ 398 399 /* Actually, we don't care if it's failed or not... On a RAID 400 set with correct parity, this function should be callable 401 on any component without ill effects. */ 402 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */ 403 404 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 405 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { 406 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col); 407 408 raidPtr->reconInProgress--; 409 RF_UNLOCK_MUTEX(raidPtr->mutex); 410 RF_SIGNAL_COND(raidPtr->waitForReconCond); 411 return (EINVAL); 412 } 413 #endif 414 415 /* This device may have been opened successfully the 416 first time. Close it before trying to open it again.. */ 417 418 if (raidPtr->raid_cinfo[col].ci_vp != NULL) { 419 #if 0 420 printf("Closed the open device: %s\n", 421 raidPtr->Disks[col].devname); 422 #endif 423 vp = raidPtr->raid_cinfo[col].ci_vp; 424 ac = raidPtr->Disks[col].auto_configured; 425 RF_UNLOCK_MUTEX(raidPtr->mutex); 426 rf_close_component(raidPtr, vp, ac); 427 RF_LOCK_MUTEX(raidPtr->mutex); 428 raidPtr->raid_cinfo[col].ci_vp = NULL; 429 } 430 /* note that this disk was *not* auto_configured (any longer)*/ 431 raidPtr->Disks[col].auto_configured = 0; 432 433 #if 0 434 printf("About to (re-)open the device for rebuilding: %s\n", 435 raidPtr->Disks[col].devname); 436 #endif 437 RF_UNLOCK_MUTEX(raidPtr->mutex); 438 retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE); 439 440 if (retcode) { 441 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid, 442 raidPtr->Disks[col].devname, retcode); 443 444 /* the component isn't responding properly... 445 must be still dead :-( */ 446 RF_LOCK_MUTEX(raidPtr->mutex); 447 raidPtr->reconInProgress--; 448 RF_UNLOCK_MUTEX(raidPtr->mutex); 449 RF_SIGNAL_COND(raidPtr->waitForReconCond); 450 return(retcode); 451 } 452 453 /* Ok, so we can at least do a lookup... 454 How about actually getting a vp for it? */ 455 456 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) { 457 RF_LOCK_MUTEX(raidPtr->mutex); 458 raidPtr->reconInProgress--; 459 RF_UNLOCK_MUTEX(raidPtr->mutex); 460 RF_SIGNAL_COND(raidPtr->waitForReconCond); 461 return(retcode); 462 } 463 464 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred); 465 if (retcode) { 466 RF_LOCK_MUTEX(raidPtr->mutex); 467 raidPtr->reconInProgress--; 468 RF_UNLOCK_MUTEX(raidPtr->mutex); 469 RF_SIGNAL_COND(raidPtr->waitForReconCond); 470 return(retcode); 471 } 472 RF_LOCK_MUTEX(raidPtr->mutex); 473 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize; 474 475 raidPtr->Disks[col].numBlocks = dpart.part->p_size - 476 rf_protectedSectors; 477 478 raidPtr->raid_cinfo[col].ci_vp = vp; 479 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev; 480 481 raidPtr->Disks[col].dev = va.va_rdev; 482 483 /* we allow the user to specify that only a fraction 484 of the disks should be used this is just for debug: 485 it speeds up * the parity scan */ 486 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks * 487 rf_sizePercentage / 100; 488 RF_UNLOCK_MUTEX(raidPtr->mutex); 489 490 spareDiskPtr = &raidPtr->Disks[col]; 491 spareDiskPtr->status = rf_ds_used_spare; 492 493 printf("raid%d: initiating in-place reconstruction on column %d\n", 494 raidPtr->raidid, col); 495 496 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, 497 numDisksDone, col); 498 raidPtr->reconDesc = (void *) reconDesc; 499 #if RF_RECON_STATS > 0 500 reconDesc->hsStallCount = 0; 501 reconDesc->numReconExecDelays = 0; 502 reconDesc->numReconEventWaits = 0; 503 #endif /* RF_RECON_STATS > 0 */ 504 reconDesc->reconExecTimerRunning = 0; 505 reconDesc->reconExecTicks = 0; 506 reconDesc->maxReconExecTicks = 0; 507 rc = rf_ContinueReconstructFailedDisk(reconDesc); 508 509 if (!rc) { 510 RF_LOCK_MUTEX(raidPtr->mutex); 511 /* Need to set these here, as at this point it'll be claiming 512 that the disk is in rf_ds_spared! But we know better :-) */ 513 514 raidPtr->Disks[col].status = rf_ds_optimal; 515 raidPtr->status = rf_rs_optimal; 516 RF_UNLOCK_MUTEX(raidPtr->mutex); 517 518 /* fix up the component label */ 519 /* Don't actually need the read here.. */ 520 raidread_component_label(raidPtr->raid_cinfo[col].ci_dev, 521 raidPtr->raid_cinfo[col].ci_vp, 522 &c_label); 523 524 RF_LOCK_MUTEX(raidPtr->mutex); 525 raid_init_component_label(raidPtr, &c_label); 526 527 c_label.row = 0; 528 c_label.column = col; 529 530 /* We've just done a rebuild based on all the other 531 disks, so at this point the parity is known to be 532 clean, even if it wasn't before. */ 533 534 /* XXX doesn't hold for RAID 6!!*/ 535 536 raidPtr->parity_good = RF_RAID_CLEAN; 537 RF_UNLOCK_MUTEX(raidPtr->mutex); 538 539 raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev, 540 raidPtr->raid_cinfo[col].ci_vp, 541 &c_label); 542 543 } else { 544 /* Reconstruct-in-place failed. Disk goes back to 545 "failed" status, regardless of what it was before. */ 546 RF_LOCK_MUTEX(raidPtr->mutex); 547 raidPtr->Disks[col].status = rf_ds_failed; 548 RF_UNLOCK_MUTEX(raidPtr->mutex); 549 } 550 551 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); 552 553 RF_LOCK_MUTEX(raidPtr->mutex); 554 raidPtr->reconInProgress--; 555 RF_UNLOCK_MUTEX(raidPtr->mutex); 556 557 RF_SIGNAL_COND(raidPtr->waitForReconCond); 558 return (rc); 559 } 560 561 562 int 563 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc) 564 { 565 RF_Raid_t *raidPtr = reconDesc->raidPtr; 566 RF_RowCol_t col = reconDesc->col; 567 RF_RowCol_t scol = reconDesc->scol; 568 RF_ReconMap_t *mapPtr; 569 RF_ReconCtrl_t *tmp_reconctrl; 570 RF_ReconEvent_t *event; 571 RF_CallbackDesc_t *p; 572 struct timeval etime, elpsd; 573 unsigned long xor_s, xor_resid_us; 574 int i, ds; 575 int status; 576 int recon_error, write_error; 577 578 raidPtr->accumXorTimeUs = 0; 579 #if RF_ACC_TRACE > 0 580 /* create one trace record per physical disk */ 581 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); 582 #endif 583 584 /* quiesce the array prior to starting recon. this is needed 585 * to assure no nasty interactions with pending user writes. 586 * We need to do this before we change the disk or row status. */ 587 588 Dprintf("RECON: begin request suspend\n"); 589 rf_SuspendNewRequestsAndWait(raidPtr); 590 Dprintf("RECON: end request suspend\n"); 591 592 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */ 593 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol); 594 595 RF_LOCK_MUTEX(raidPtr->mutex); 596 597 /* create the reconstruction control pointer and install it in 598 * the right slot */ 599 raidPtr->reconControl = tmp_reconctrl; 600 mapPtr = raidPtr->reconControl->reconMap; 601 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs; 602 raidPtr->reconControl->numRUsComplete = 0; 603 raidPtr->status = rf_rs_reconstructing; 604 raidPtr->Disks[col].status = rf_ds_reconstructing; 605 raidPtr->Disks[col].spareCol = scol; 606 607 RF_UNLOCK_MUTEX(raidPtr->mutex); 608 609 RF_GETTIME(raidPtr->reconControl->starttime); 610 611 /* now start up the actual reconstruction: issue a read for 612 * each surviving disk */ 613 614 reconDesc->numDisksDone = 0; 615 for (i = 0; i < raidPtr->numCol; i++) { 616 if (i != col) { 617 /* find and issue the next I/O on the 618 * indicated disk */ 619 if (IssueNextReadRequest(raidPtr, i)) { 620 Dprintf1("RECON: done issuing for c%d\n", i); 621 reconDesc->numDisksDone++; 622 } 623 } 624 } 625 626 Dprintf("RECON: resume requests\n"); 627 rf_ResumeNewRequests(raidPtr); 628 629 /* process reconstruction events until all disks report that 630 * they've completed all work */ 631 632 mapPtr = raidPtr->reconControl->reconMap; 633 recon_error = 0; 634 write_error = 0; 635 636 while (reconDesc->numDisksDone < raidPtr->numCol - 1) { 637 638 event = rf_GetNextReconEvent(reconDesc); 639 status = ProcessReconEvent(raidPtr, event); 640 641 /* the normal case is that a read completes, and all is well. */ 642 if (status == RF_RECON_DONE_READS) { 643 reconDesc->numDisksDone++; 644 } else if ((status == RF_RECON_READ_ERROR) || 645 (status == RF_RECON_WRITE_ERROR)) { 646 /* an error was encountered while reconstructing... 647 Pretend we've finished this disk. 648 */ 649 recon_error = 1; 650 raidPtr->reconControl->error = 1; 651 652 /* bump the numDisksDone count for reads, 653 but not for writes */ 654 if (status == RF_RECON_READ_ERROR) 655 reconDesc->numDisksDone++; 656 657 /* write errors are special -- when we are 658 done dealing with the reads that are 659 finished, we don't want to wait for any 660 writes */ 661 if (status == RF_RECON_WRITE_ERROR) 662 write_error = 1; 663 664 } else if (status == RF_RECON_READ_STOPPED) { 665 /* count this component as being "done" */ 666 reconDesc->numDisksDone++; 667 } 668 669 if (recon_error) { 670 671 /* make sure any stragglers are woken up so that 672 their theads will complete, and we can get out 673 of here with all IO processed */ 674 675 while (raidPtr->reconControl->headSepCBList) { 676 p = raidPtr->reconControl->headSepCBList; 677 raidPtr->reconControl->headSepCBList = p->next; 678 p->next = NULL; 679 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); 680 rf_FreeCallbackDesc(p); 681 } 682 } 683 684 raidPtr->reconControl->numRUsTotal = 685 mapPtr->totalRUs; 686 raidPtr->reconControl->numRUsComplete = 687 mapPtr->totalRUs - 688 rf_UnitsLeftToReconstruct(mapPtr); 689 690 #if RF_DEBUG_RECON 691 raidPtr->reconControl->percentComplete = 692 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 693 if (rf_prReconSched) { 694 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime)); 695 } 696 #endif 697 } 698 699 mapPtr = raidPtr->reconControl->reconMap; 700 if (rf_reconDebug) { 701 printf("RECON: all reads completed\n"); 702 } 703 /* at this point all the reads have completed. We now wait 704 * for any pending writes to complete, and then we're done */ 705 706 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) { 707 708 event = rf_GetNextReconEvent(reconDesc); 709 status = ProcessReconEvent(raidPtr, event); 710 711 if (status == RF_RECON_WRITE_ERROR) { 712 recon_error = 1; 713 raidPtr->reconControl->error = 1; 714 /* an error was encountered at the very end... bail */ 715 } else { 716 #if RF_DEBUG_RECON 717 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); 718 if (rf_prReconSched) { 719 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime)); 720 } 721 #endif 722 } 723 } 724 725 if (recon_error) { 726 /* we've encountered an error in reconstructing. */ 727 printf("raid%d: reconstruction failed.\n", raidPtr->raidid); 728 729 /* we start by blocking IO to the RAID set. */ 730 rf_SuspendNewRequestsAndWait(raidPtr); 731 732 RF_LOCK_MUTEX(raidPtr->mutex); 733 /* mark set as being degraded, rather than 734 rf_rs_reconstructing as we were before the problem. 735 After this is done we can update status of the 736 component disks without worrying about someone 737 trying to read from a failed component. 738 */ 739 raidPtr->status = rf_rs_degraded; 740 RF_UNLOCK_MUTEX(raidPtr->mutex); 741 742 /* resume IO */ 743 rf_ResumeNewRequests(raidPtr); 744 745 /* At this point there are two cases: 746 1) If we've experienced a read error, then we've 747 already waited for all the reads we're going to get, 748 and we just need to wait for the writes. 749 750 2) If we've experienced a write error, we've also 751 already waited for all the reads to complete, 752 but there is little point in waiting for the writes -- 753 when they do complete, they will just be ignored. 754 755 So we just wait for writes to complete if we didn't have a 756 write error. 757 */ 758 759 if (!write_error) { 760 /* wait for writes to complete */ 761 while (raidPtr->reconControl->pending_writes > 0) { 762 763 event = rf_GetNextReconEvent(reconDesc); 764 status = ProcessReconEvent(raidPtr, event); 765 766 if (status == RF_RECON_WRITE_ERROR) { 767 raidPtr->reconControl->error = 1; 768 /* an error was encountered at the very end... bail. 769 This will be very bad news for the user, since 770 at this point there will have been a read error 771 on one component, and a write error on another! 772 */ 773 break; 774 } 775 } 776 } 777 778 779 /* cleanup */ 780 781 /* drain the event queue - after waiting for the writes above, 782 there shouldn't be much (if anything!) left in the queue. */ 783 784 rf_DrainReconEventQueue(reconDesc); 785 786 /* XXX As much as we'd like to free the recon control structure 787 and the reconDesc, we have no way of knowing if/when those will 788 be touched by IO that has yet to occur. It is rather poor to be 789 basically causing a 'memory leak' here, but there doesn't seem to be 790 a cleaner alternative at this time. Perhaps when the reconstruct code 791 gets a makeover this problem will go away. 792 */ 793 #if 0 794 rf_FreeReconControl(raidPtr); 795 #endif 796 797 #if RF_ACC_TRACE > 0 798 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); 799 #endif 800 /* XXX see comment above */ 801 #if 0 802 FreeReconDesc(reconDesc); 803 #endif 804 805 return (1); 806 } 807 808 /* Success: mark the dead disk as reconstructed. We quiesce 809 * the array here to assure no nasty interactions with pending 810 * user accesses when we free up the psstatus structure as 811 * part of FreeReconControl() */ 812 813 rf_SuspendNewRequestsAndWait(raidPtr); 814 815 RF_LOCK_MUTEX(raidPtr->mutex); 816 raidPtr->numFailures--; 817 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE); 818 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared; 819 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal; 820 RF_UNLOCK_MUTEX(raidPtr->mutex); 821 RF_GETTIME(etime); 822 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd); 823 824 rf_ResumeNewRequests(raidPtr); 825 826 printf("raid%d: Reconstruction of disk at col %d completed\n", 827 raidPtr->raidid, col); 828 xor_s = raidPtr->accumXorTimeUs / 1000000; 829 xor_resid_us = raidPtr->accumXorTimeUs % 1000000; 830 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n", 831 raidPtr->raidid, 832 (int) elpsd.tv_sec, (int) elpsd.tv_usec, 833 raidPtr->accumXorTimeUs, xor_s, xor_resid_us); 834 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n", 835 raidPtr->raidid, 836 (int) raidPtr->reconControl->starttime.tv_sec, 837 (int) raidPtr->reconControl->starttime.tv_usec, 838 (int) etime.tv_sec, (int) etime.tv_usec); 839 #if RF_RECON_STATS > 0 840 printf("raid%d: Total head-sep stall count was %d\n", 841 raidPtr->raidid, (int) reconDesc->hsStallCount); 842 #endif /* RF_RECON_STATS > 0 */ 843 rf_FreeReconControl(raidPtr); 844 #if RF_ACC_TRACE > 0 845 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); 846 #endif 847 FreeReconDesc(reconDesc); 848 849 return (0); 850 851 } 852 /***************************************************************************** 853 * do the right thing upon each reconstruction event. 854 *****************************************************************************/ 855 static int 856 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event) 857 { 858 int retcode = 0, submitblocked; 859 RF_ReconBuffer_t *rbuf; 860 RF_SectorCount_t sectorsPerRU; 861 862 retcode = RF_RECON_READ_STOPPED; 863 864 Dprintf1("RECON: ProcessReconEvent type %d\n", event->type); 865 switch (event->type) { 866 867 /* a read I/O has completed */ 868 case RF_REVENT_READDONE: 869 rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf; 870 Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n", 871 event->col, rbuf->parityStripeID); 872 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n", 873 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 874 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 875 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 876 if (!raidPtr->reconControl->error) { 877 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0); 878 Dprintf1("RECON: submitblocked=%d\n", submitblocked); 879 if (!submitblocked) 880 retcode = IssueNextReadRequest(raidPtr, event->col); 881 else 882 retcode = 0; 883 } 884 break; 885 886 /* a write I/O has completed */ 887 case RF_REVENT_WRITEDONE: 888 #if RF_DEBUG_RECON 889 if (rf_floatingRbufDebug) { 890 rf_CheckFloatingRbufCount(raidPtr, 1); 891 } 892 #endif 893 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; 894 rbuf = (RF_ReconBuffer_t *) event->arg; 895 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 896 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n", 897 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete); 898 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap, 899 rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1); 900 rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru); 901 902 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 903 raidPtr->reconControl->pending_writes--; 904 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 905 906 if (rbuf->type == RF_RBUF_TYPE_FLOATING) { 907 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 908 while(raidPtr->reconControl->rb_lock) { 909 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0, 910 &raidPtr->reconControl->rb_mutex); 911 } 912 raidPtr->reconControl->rb_lock = 1; 913 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 914 915 raidPtr->numFullReconBuffers--; 916 rf_ReleaseFloatingReconBuffer(raidPtr, rbuf); 917 918 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 919 raidPtr->reconControl->rb_lock = 0; 920 wakeup(&raidPtr->reconControl->rb_lock); 921 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 922 } else 923 if (rbuf->type == RF_RBUF_TYPE_FORCED) 924 rf_FreeReconBuffer(rbuf); 925 else 926 RF_ASSERT(0); 927 retcode = 0; 928 break; 929 930 case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been 931 * cleared */ 932 Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col); 933 if (!raidPtr->reconControl->error) { 934 submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf, 935 0, (int) (long) event->arg); 936 RF_ASSERT(!submitblocked); /* we wouldn't have gotten the 937 * BUFCLEAR event if we 938 * couldn't submit */ 939 retcode = IssueNextReadRequest(raidPtr, event->col); 940 } 941 break; 942 943 case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction 944 * blockage has been cleared */ 945 DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col); 946 if (!raidPtr->reconControl->error) { 947 retcode = TryToRead(raidPtr, event->col); 948 } 949 break; 950 951 case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation 952 * reconstruction blockage has been 953 * cleared */ 954 Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col); 955 if (!raidPtr->reconControl->error) { 956 retcode = TryToRead(raidPtr, event->col); 957 } 958 break; 959 960 /* a buffer has become ready to write */ 961 case RF_REVENT_BUFREADY: 962 Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col); 963 if (!raidPtr->reconControl->error) { 964 retcode = IssueNextWriteRequest(raidPtr); 965 #if RF_DEBUG_RECON 966 if (rf_floatingRbufDebug) { 967 rf_CheckFloatingRbufCount(raidPtr, 1); 968 } 969 #endif 970 } 971 break; 972 973 /* we need to skip the current RU entirely because it got 974 * recon'd while we were waiting for something else to happen */ 975 case RF_REVENT_SKIP: 976 DDprintf1("RECON: SKIP EVENT: col %d\n", event->col); 977 if (!raidPtr->reconControl->error) { 978 retcode = IssueNextReadRequest(raidPtr, event->col); 979 } 980 break; 981 982 /* a forced-reconstruction read access has completed. Just 983 * submit the buffer */ 984 case RF_REVENT_FORCEDREADDONE: 985 rbuf = (RF_ReconBuffer_t *) event->arg; 986 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 987 DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col); 988 if (!raidPtr->reconControl->error) { 989 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0); 990 RF_ASSERT(!submitblocked); 991 retcode = 0; 992 } 993 break; 994 995 /* A read I/O failed to complete */ 996 case RF_REVENT_READ_FAILED: 997 retcode = RF_RECON_READ_ERROR; 998 break; 999 1000 /* A write I/O failed to complete */ 1001 case RF_REVENT_WRITE_FAILED: 1002 retcode = RF_RECON_WRITE_ERROR; 1003 1004 rbuf = (RF_ReconBuffer_t *) event->arg; 1005 1006 /* cleanup the disk queue data */ 1007 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); 1008 1009 /* At this point we're erroring out, badly, and floatingRbufs 1010 may not even be valid. Rather than putting this back onto 1011 the floatingRbufs list, just arrange for its immediate 1012 destruction. 1013 */ 1014 rf_FreeReconBuffer(rbuf); 1015 break; 1016 1017 /* a forced read I/O failed to complete */ 1018 case RF_REVENT_FORCEDREAD_FAILED: 1019 retcode = RF_RECON_READ_ERROR; 1020 break; 1021 1022 default: 1023 RF_PANIC(); 1024 } 1025 rf_FreeReconEventDesc(event); 1026 return (retcode); 1027 } 1028 /***************************************************************************** 1029 * 1030 * find the next thing that's needed on the indicated disk, and issue 1031 * a read request for it. We assume that the reconstruction buffer 1032 * associated with this process is free to receive the data. If 1033 * reconstruction is blocked on the indicated RU, we issue a 1034 * blockage-release request instead of a physical disk read request. 1035 * If the current disk gets too far ahead of the others, we issue a 1036 * head-separation wait request and return. 1037 * 1038 * ctrl->{ru_count, curPSID, diskOffset} and 1039 * rbuf->failedDiskSectorOffset are maintained to point to the unit 1040 * we're currently accessing. Note that this deviates from the 1041 * standard C idiom of having counters point to the next thing to be 1042 * accessed. This allows us to easily retry when we're blocked by 1043 * head separation or reconstruction-blockage events. 1044 * 1045 *****************************************************************************/ 1046 static int 1047 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col) 1048 { 1049 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col]; 1050 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1051 RF_ReconBuffer_t *rbuf = ctrl->rbuf; 1052 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; 1053 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1054 int do_new_check = 0, retcode = 0, status; 1055 1056 /* if we are currently the slowest disk, mark that we have to do a new 1057 * check */ 1058 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter) 1059 do_new_check = 1; 1060 1061 while (1) { 1062 1063 ctrl->ru_count++; 1064 if (ctrl->ru_count < RUsPerPU) { 1065 ctrl->diskOffset += sectorsPerRU; 1066 rbuf->failedDiskSectorOffset += sectorsPerRU; 1067 } else { 1068 ctrl->curPSID++; 1069 ctrl->ru_count = 0; 1070 /* code left over from when head-sep was based on 1071 * parity stripe id */ 1072 if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) { 1073 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter)); 1074 return (RF_RECON_DONE_READS); /* finito! */ 1075 } 1076 /* find the disk offsets of the start of the parity 1077 * stripe on both the current disk and the failed 1078 * disk. skip this entire parity stripe if either disk 1079 * does not appear in the indicated PS */ 1080 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset, 1081 &rbuf->spCol, &rbuf->spOffset); 1082 if (status) { 1083 ctrl->ru_count = RUsPerPU - 1; 1084 continue; 1085 } 1086 } 1087 rbuf->which_ru = ctrl->ru_count; 1088 1089 /* skip this RU if it's already been reconstructed */ 1090 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) { 1091 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count); 1092 continue; 1093 } 1094 break; 1095 } 1096 ctrl->headSepCounter++; 1097 if (do_new_check) 1098 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */ 1099 1100 1101 /* at this point, we have definitely decided what to do, and we have 1102 * only to see if we can actually do it now */ 1103 rbuf->parityStripeID = ctrl->curPSID; 1104 rbuf->which_ru = ctrl->ru_count; 1105 #if RF_ACC_TRACE > 0 1106 memset((char *) &raidPtr->recon_tracerecs[col], 0, 1107 sizeof(raidPtr->recon_tracerecs[col])); 1108 raidPtr->recon_tracerecs[col].reconacc = 1; 1109 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); 1110 #endif 1111 retcode = TryToRead(raidPtr, col); 1112 return (retcode); 1113 } 1114 1115 /* 1116 * tries to issue the next read on the indicated disk. We may be 1117 * blocked by (a) the heads being too far apart, or (b) recon on the 1118 * indicated RU being blocked due to a write by a user thread. In 1119 * this case, we issue a head-sep or blockage wait request, which will 1120 * cause this same routine to be invoked again later when the blockage 1121 * has cleared. 1122 */ 1123 1124 static int 1125 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col) 1126 { 1127 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col]; 1128 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; 1129 RF_StripeNum_t psid = ctrl->curPSID; 1130 RF_ReconUnitNum_t which_ru = ctrl->ru_count; 1131 RF_DiskQueueData_t *req; 1132 int status; 1133 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; 1134 1135 /* if the current disk is too far ahead of the others, issue a 1136 * head-separation wait and return */ 1137 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru)) 1138 return (0); 1139 1140 /* allocate a new PSS in case we need it */ 1141 newpssPtr = rf_AllocPSStatus(raidPtr); 1142 1143 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1144 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr); 1145 1146 if (pssPtr != newpssPtr) { 1147 rf_FreePSStatus(raidPtr, newpssPtr); 1148 } 1149 1150 /* if recon is blocked on the indicated parity stripe, issue a 1151 * block-wait request and return. this also must mark the indicated RU 1152 * in the stripe as under reconstruction if not blocked. */ 1153 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru); 1154 if (status == RF_PSS_RECON_BLOCKED) { 1155 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru); 1156 goto out; 1157 } else 1158 if (status == RF_PSS_FORCED_ON_WRITE) { 1159 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP); 1160 goto out; 1161 } 1162 /* make one last check to be sure that the indicated RU didn't get 1163 * reconstructed while we were waiting for something else to happen. 1164 * This is unfortunate in that it causes us to make this check twice 1165 * in the normal case. Might want to make some attempt to re-work 1166 * this so that we only do this check if we've definitely blocked on 1167 * one of the above checks. When this condition is detected, we may 1168 * have just created a bogus status entry, which we need to delete. */ 1169 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) { 1170 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru); 1171 if (pssPtr == newpssPtr) 1172 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1173 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP); 1174 goto out; 1175 } 1176 /* found something to read. issue the I/O */ 1177 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n", 1178 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer); 1179 #if RF_ACC_TRACE > 0 1180 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer); 1181 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer); 1182 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us = 1183 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer); 1184 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); 1185 #endif 1186 /* should be ok to use a NULL proc pointer here, all the bufs we use 1187 * should be in kernel space */ 1188 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru, 1189 ReconReadDoneProc, (void *) ctrl, 1190 #if RF_ACC_TRACE > 0 1191 &raidPtr->recon_tracerecs[col], 1192 #else 1193 NULL, 1194 #endif 1195 (void *) raidPtr, 0, NULL, PR_WAITOK); 1196 1197 ctrl->rbuf->arg = (void *) req; 1198 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY); 1199 pssPtr->issued[col] = 1; 1200 1201 out: 1202 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1203 return (0); 1204 } 1205 1206 1207 /* 1208 * given a parity stripe ID, we want to find out whether both the 1209 * current disk and the failed disk exist in that parity stripe. If 1210 * not, we want to skip this whole PS. If so, we want to find the 1211 * disk offset of the start of the PS on both the current disk and the 1212 * failed disk. 1213 * 1214 * this works by getting a list of disks comprising the indicated 1215 * parity stripe, and searching the list for the current and failed 1216 * disks. Once we've decided they both exist in the parity stripe, we 1217 * need to decide whether each is data or parity, so that we'll know 1218 * which mapping function to call to get the corresponding disk 1219 * offsets. 1220 * 1221 * this is kind of unpleasant, but doing it this way allows the 1222 * reconstruction code to use parity stripe IDs rather than physical 1223 * disks address to march through the failed disk, which greatly 1224 * simplifies a lot of code, as well as eliminating the need for a 1225 * reverse-mapping function. I also think it will execute faster, 1226 * since the calls to the mapping module are kept to a minimum. 1227 * 1228 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING 1229 * THE STRIPE IN THE CORRECT ORDER 1230 * 1231 * raidPtr - raid descriptor 1232 * psid - parity stripe identifier 1233 * col - column of disk to find the offsets for 1234 * spCol - out: col of spare unit for failed unit 1235 * spOffset - out: offset into disk containing spare unit 1236 * 1237 */ 1238 1239 1240 static int 1241 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid, 1242 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset, 1243 RF_SectorNum_t *outFailedDiskSectorOffset, 1244 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset) 1245 { 1246 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1247 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1248 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ 1249 RF_RowCol_t *diskids; 1250 u_int i, j, k, i_offset, j_offset; 1251 RF_RowCol_t pcol; 1252 int testcol; 1253 RF_SectorNum_t poffset; 1254 char i_is_parity = 0, j_is_parity = 0; 1255 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; 1256 1257 /* get a listing of the disks comprising that stripe */ 1258 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); 1259 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids); 1260 RF_ASSERT(diskids); 1261 1262 /* reject this entire parity stripe if it does not contain the 1263 * indicated disk or it does not contain the failed disk */ 1264 1265 for (i = 0; i < stripeWidth; i++) { 1266 if (col == diskids[i]) 1267 break; 1268 } 1269 if (i == stripeWidth) 1270 goto skipit; 1271 for (j = 0; j < stripeWidth; j++) { 1272 if (fcol == diskids[j]) 1273 break; 1274 } 1275 if (j == stripeWidth) { 1276 goto skipit; 1277 } 1278 /* find out which disk the parity is on */ 1279 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP); 1280 1281 /* find out if either the current RU or the failed RU is parity */ 1282 /* also, if the parity occurs in this stripe prior to the data and/or 1283 * failed col, we need to decrement i and/or j */ 1284 for (k = 0; k < stripeWidth; k++) 1285 if (diskids[k] == pcol) 1286 break; 1287 RF_ASSERT(k < stripeWidth); 1288 i_offset = i; 1289 j_offset = j; 1290 if (k < i) 1291 i_offset--; 1292 else 1293 if (k == i) { 1294 i_is_parity = 1; 1295 i_offset = 0; 1296 } /* set offsets to zero to disable multiply 1297 * below */ 1298 if (k < j) 1299 j_offset--; 1300 else 1301 if (k == j) { 1302 j_is_parity = 1; 1303 j_offset = 0; 1304 } 1305 /* at this point, [ij]_is_parity tells us whether the [current,failed] 1306 * disk is parity at the start of this RU, and, if data, "[ij]_offset" 1307 * tells us how far into the stripe the [current,failed] disk is. */ 1308 1309 /* call the mapping routine to get the offset into the current disk, 1310 * repeat for failed disk. */ 1311 if (i_is_parity) 1312 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1313 else 1314 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1315 1316 RF_ASSERT(col == testcol); 1317 1318 if (j_is_parity) 1319 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1320 else 1321 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1322 RF_ASSERT(fcol == testcol); 1323 1324 /* now locate the spare unit for the failed unit */ 1325 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1326 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { 1327 if (j_is_parity) 1328 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1329 else 1330 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1331 } else { 1332 #endif 1333 *spCol = raidPtr->reconControl->spareCol; 1334 *spOffset = *outFailedDiskSectorOffset; 1335 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1336 } 1337 #endif 1338 return (0); 1339 1340 skipit: 1341 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n", 1342 psid, col); 1343 return (1); 1344 } 1345 /* this is called when a buffer has become ready to write to the replacement disk */ 1346 static int 1347 IssueNextWriteRequest(RF_Raid_t *raidPtr) 1348 { 1349 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1350 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1351 #if RF_ACC_TRACE > 0 1352 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1353 #endif 1354 RF_ReconBuffer_t *rbuf; 1355 RF_DiskQueueData_t *req; 1356 1357 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl); 1358 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't 1359 * have gotten the event that sent us here */ 1360 RF_ASSERT(rbuf->pssPtr); 1361 1362 rbuf->pssPtr->writeRbuf = rbuf; 1363 rbuf->pssPtr = NULL; 1364 1365 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", 1366 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, 1367 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); 1368 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", 1369 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 1370 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 1371 1372 /* should be ok to use a NULL b_proc here b/c all addrs should be in 1373 * kernel space */ 1374 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, 1375 sectorsPerRU, rbuf->buffer, 1376 rbuf->parityStripeID, rbuf->which_ru, 1377 ReconWriteDoneProc, (void *) rbuf, 1378 #if RF_ACC_TRACE > 0 1379 &raidPtr->recon_tracerecs[fcol], 1380 #else 1381 NULL, 1382 #endif 1383 (void *) raidPtr, 0, NULL, PR_WAITOK); 1384 1385 rbuf->arg = (void *) req; 1386 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 1387 raidPtr->reconControl->pending_writes++; 1388 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 1389 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY); 1390 1391 return (0); 1392 } 1393 1394 /* 1395 * this gets called upon the completion of a reconstruction read 1396 * operation the arg is a pointer to the per-disk reconstruction 1397 * control structure for the process that just finished a read. 1398 * 1399 * called at interrupt context in the kernel, so don't do anything 1400 * illegal here. 1401 */ 1402 static int 1403 ReconReadDoneProc(void *arg, int status) 1404 { 1405 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; 1406 RF_Raid_t *raidPtr; 1407 1408 /* Detect that reconCtrl is no longer valid, and if that 1409 is the case, bail without calling rf_CauseReconEvent(). 1410 There won't be anyone listening for this event anyway */ 1411 1412 if (ctrl->reconCtrl == NULL) 1413 return(0); 1414 1415 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; 1416 1417 if (status) { 1418 printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status); 1419 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED); 1420 return(0); 1421 } 1422 #if RF_ACC_TRACE > 0 1423 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1424 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1425 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = 1426 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1427 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); 1428 #endif 1429 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE); 1430 return (0); 1431 } 1432 /* this gets called upon the completion of a reconstruction write operation. 1433 * the arg is a pointer to the rbuf that was just written 1434 * 1435 * called at interrupt context in the kernel, so don't do anything illegal here. 1436 */ 1437 static int 1438 ReconWriteDoneProc(void *arg, int status) 1439 { 1440 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; 1441 1442 /* Detect that reconControl is no longer valid, and if that 1443 is the case, bail without calling rf_CauseReconEvent(). 1444 There won't be anyone listening for this event anyway */ 1445 1446 if (rbuf->raidPtr->reconControl == NULL) 1447 return(0); 1448 1449 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru); 1450 if (status) { 1451 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid); 1452 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED); 1453 return(0); 1454 } 1455 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE); 1456 return (0); 1457 } 1458 1459 1460 /* 1461 * computes a new minimum head sep, and wakes up anyone who needs to 1462 * be woken as a result 1463 */ 1464 static void 1465 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr) 1466 { 1467 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; 1468 RF_HeadSepLimit_t new_min; 1469 RF_RowCol_t i; 1470 RF_CallbackDesc_t *p; 1471 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition 1472 * of a minimum */ 1473 1474 1475 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1476 while(reconCtrlPtr->rb_lock) { 1477 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex); 1478 } 1479 reconCtrlPtr->rb_lock = 1; 1480 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1481 1482 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */ 1483 for (i = 0; i < raidPtr->numCol; i++) 1484 if (i != reconCtrlPtr->fcol) { 1485 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) 1486 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; 1487 } 1488 /* set the new minimum and wake up anyone who can now run again */ 1489 if (new_min != reconCtrlPtr->minHeadSepCounter) { 1490 reconCtrlPtr->minHeadSepCounter = new_min; 1491 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min); 1492 while (reconCtrlPtr->headSepCBList) { 1493 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) 1494 break; 1495 p = reconCtrlPtr->headSepCBList; 1496 reconCtrlPtr->headSepCBList = p->next; 1497 p->next = NULL; 1498 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR); 1499 rf_FreeCallbackDesc(p); 1500 } 1501 1502 } 1503 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1504 reconCtrlPtr->rb_lock = 0; 1505 wakeup(&reconCtrlPtr->rb_lock); 1506 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1507 } 1508 1509 /* 1510 * checks to see that the maximum head separation will not be violated 1511 * if we initiate a reconstruction I/O on the indicated disk. 1512 * Limiting the maximum head separation between two disks eliminates 1513 * the nasty buffer-stall conditions that occur when one disk races 1514 * ahead of the others and consumes all of the floating recon buffers. 1515 * This code is complex and unpleasant but it's necessary to avoid 1516 * some very nasty, albeit fairly rare, reconstruction behavior. 1517 * 1518 * returns non-zero if and only if we have to stop working on the 1519 * indicated disk due to a head-separation delay. 1520 */ 1521 static int 1522 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl, 1523 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, 1524 RF_ReconUnitNum_t which_ru) 1525 { 1526 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl; 1527 RF_CallbackDesc_t *cb, *p, *pt; 1528 int retval = 0; 1529 1530 /* if we're too far ahead of the slowest disk, stop working on this 1531 * disk until the slower ones catch up. We do this by scheduling a 1532 * wakeup callback for the time when the slowest disk has caught up. 1533 * We define "caught up" with 20% hysteresis, i.e. the head separation 1534 * must have fallen to at most 80% of the max allowable head 1535 * separation before we'll wake up. 1536 * 1537 */ 1538 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1539 while(reconCtrlPtr->rb_lock) { 1540 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex); 1541 } 1542 reconCtrlPtr->rb_lock = 1; 1543 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1544 if ((raidPtr->headSepLimit >= 0) && 1545 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) { 1546 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n", 1547 raidPtr->raidid, col, ctrl->headSepCounter, 1548 reconCtrlPtr->minHeadSepCounter, 1549 raidPtr->headSepLimit); 1550 cb = rf_AllocCallbackDesc(); 1551 /* the minHeadSepCounter value we have to get to before we'll 1552 * wake up. build in 20% hysteresis. */ 1553 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5); 1554 cb->col = col; 1555 cb->next = NULL; 1556 1557 /* insert this callback descriptor into the sorted list of 1558 * pending head-sep callbacks */ 1559 p = reconCtrlPtr->headSepCBList; 1560 if (!p) 1561 reconCtrlPtr->headSepCBList = cb; 1562 else 1563 if (cb->callbackArg.v < p->callbackArg.v) { 1564 cb->next = reconCtrlPtr->headSepCBList; 1565 reconCtrlPtr->headSepCBList = cb; 1566 } else { 1567 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next); 1568 cb->next = p; 1569 pt->next = cb; 1570 } 1571 retval = 1; 1572 #if RF_RECON_STATS > 0 1573 ctrl->reconCtrl->reconDesc->hsStallCount++; 1574 #endif /* RF_RECON_STATS > 0 */ 1575 } 1576 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); 1577 reconCtrlPtr->rb_lock = 0; 1578 wakeup(&reconCtrlPtr->rb_lock); 1579 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); 1580 1581 return (retval); 1582 } 1583 /* 1584 * checks to see if reconstruction has been either forced or blocked 1585 * by a user operation. if forced, we skip this RU entirely. else if 1586 * blocked, put ourselves on the wait list. else return 0. 1587 * 1588 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY 1589 */ 1590 static int 1591 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, 1592 RF_ReconParityStripeStatus_t *pssPtr, 1593 RF_PerDiskReconCtrl_t *ctrl, 1594 RF_RowCol_t col, 1595 RF_StripeNum_t psid, 1596 RF_ReconUnitNum_t which_ru) 1597 { 1598 RF_CallbackDesc_t *cb; 1599 int retcode = 0; 1600 1601 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) 1602 retcode = RF_PSS_FORCED_ON_WRITE; 1603 else 1604 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { 1605 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru); 1606 cb = rf_AllocCallbackDesc(); /* append ourselves to 1607 * the blockage-wait 1608 * list */ 1609 cb->col = col; 1610 cb->next = pssPtr->blockWaitList; 1611 pssPtr->blockWaitList = cb; 1612 retcode = RF_PSS_RECON_BLOCKED; 1613 } 1614 if (!retcode) 1615 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under 1616 * reconstruction */ 1617 1618 return (retcode); 1619 } 1620 /* 1621 * if reconstruction is currently ongoing for the indicated stripeID, 1622 * reconstruction is forced to completion and we return non-zero to 1623 * indicate that the caller must wait. If not, then reconstruction is 1624 * blocked on the indicated stripe and the routine returns zero. If 1625 * and only if we return non-zero, we'll cause the cbFunc to get 1626 * invoked with the cbArg when the reconstruction has completed. 1627 */ 1628 int 1629 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 1630 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg) 1631 { 1632 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're 1633 * forcing recon on */ 1634 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ 1635 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity 1636 * stripe status structure */ 1637 RF_StripeNum_t psid; /* parity stripe id */ 1638 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk 1639 * offset */ 1640 RF_RowCol_t *diskids; 1641 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ 1642 RF_RowCol_t fcol, diskno, i; 1643 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ 1644 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */ 1645 RF_CallbackDesc_t *cb; 1646 int nPromoted; 1647 1648 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1649 1650 /* allocate a new PSS in case we need it */ 1651 newpssPtr = rf_AllocPSStatus(raidPtr); 1652 1653 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1654 1655 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr); 1656 1657 if (pssPtr != newpssPtr) { 1658 rf_FreePSStatus(raidPtr, newpssPtr); 1659 } 1660 1661 /* if recon is not ongoing on this PS, just return */ 1662 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1663 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1664 return (0); 1665 } 1666 /* otherwise, we have to wait for reconstruction to complete on this 1667 * RU. */ 1668 /* In order to avoid waiting for a potentially large number of 1669 * low-priority accesses to complete, we force a normal-priority (i.e. 1670 * not low-priority) reconstruction on this RU. */ 1671 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { 1672 DDprintf1("Forcing recon on psid %ld\n", psid); 1673 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under 1674 * forced recon */ 1675 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage 1676 * that we just set */ 1677 fcol = raidPtr->reconControl->fcol; 1678 1679 /* get a listing of the disks comprising the indicated stripe */ 1680 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids); 1681 1682 /* For previously issued reads, elevate them to normal 1683 * priority. If the I/O has already completed, it won't be 1684 * found in the queue, and hence this will be a no-op. For 1685 * unissued reads, allocate buffers and issue new reads. The 1686 * fact that we've set the FORCED bit means that the regular 1687 * recon procs will not re-issue these reqs */ 1688 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++) 1689 if ((diskno = diskids[i]) != fcol) { 1690 if (pssPtr->issued[diskno]) { 1691 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru); 1692 if (rf_reconDebug && nPromoted) 1693 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno); 1694 } else { 1695 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ 1696 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset, 1697 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare 1698 * location */ 1699 new_rbuf->parityStripeID = psid; /* fill in the buffer */ 1700 new_rbuf->which_ru = which_ru; 1701 new_rbuf->failedDiskSectorOffset = fd_offset; 1702 new_rbuf->priority = RF_IO_NORMAL_PRIORITY; 1703 1704 /* use NULL b_proc b/c all addrs 1705 * should be in kernel space */ 1706 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, 1707 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf, 1708 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK); 1709 1710 new_rbuf->arg = req; 1711 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ 1712 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno); 1713 } 1714 } 1715 /* if the write is sitting in the disk queue, elevate its 1716 * priority */ 1717 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru)) 1718 if (rf_reconDebug) 1719 printf("raid%d: promoted write to col %d\n", 1720 raidPtr->raidid, fcol); 1721 } 1722 /* install a callback descriptor to be invoked when recon completes on 1723 * this parity stripe. */ 1724 cb = rf_AllocCallbackDesc(); 1725 /* XXX the following is bogus.. These functions don't really match!! 1726 * GO */ 1727 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc; 1728 cb->callbackArg.p = (void *) cbArg; 1729 cb->next = pssPtr->procWaitList; 1730 pssPtr->procWaitList = cb; 1731 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n", 1732 raidPtr->raidid, psid); 1733 1734 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1735 return (1); 1736 } 1737 /* called upon the completion of a forced reconstruction read. 1738 * all we do is schedule the FORCEDREADONE event. 1739 * called at interrupt context in the kernel, so don't do anything illegal here. 1740 */ 1741 static void 1742 ForceReconReadDoneProc(void *arg, int status) 1743 { 1744 RF_ReconBuffer_t *rbuf = arg; 1745 1746 /* Detect that reconControl is no longer valid, and if that 1747 is the case, bail without calling rf_CauseReconEvent(). 1748 There won't be anyone listening for this event anyway */ 1749 1750 if (rbuf->raidPtr->reconControl == NULL) 1751 return; 1752 1753 if (status) { 1754 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid); 1755 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED); 1756 return; 1757 } 1758 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); 1759 } 1760 /* releases a block on the reconstruction of the indicated stripe */ 1761 int 1762 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) 1763 { 1764 RF_StripeNum_t stripeID = asmap->stripeID; 1765 RF_ReconParityStripeStatus_t *pssPtr; 1766 RF_ReconUnitNum_t which_ru; 1767 RF_StripeNum_t psid; 1768 RF_CallbackDesc_t *cb; 1769 1770 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1771 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1772 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL); 1773 1774 /* When recon is forced, the pss desc can get deleted before we get 1775 * back to unblock recon. But, this can _only_ happen when recon is 1776 * forced. It would be good to put some kind of sanity check here, but 1777 * how to decide if recon was just forced or not? */ 1778 if (!pssPtr) { 1779 /* printf("Warning: no pss descriptor upon unblock on psid %ld 1780 * RU %d\n",psid,which_ru); */ 1781 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0) 1782 if (rf_reconDebug || rf_pssDebug) 1783 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru); 1784 #endif 1785 goto out; 1786 } 1787 pssPtr->blockCount--; 1788 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n", 1789 raidPtr->raidid, psid, pssPtr->blockCount); 1790 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ 1791 1792 /* unblock recon before calling CauseReconEvent in case 1793 * CauseReconEvent causes us to try to issue a new read before 1794 * returning here. */ 1795 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; 1796 1797 1798 while (pssPtr->blockWaitList) { 1799 /* spin through the block-wait list and 1800 release all the waiters */ 1801 cb = pssPtr->blockWaitList; 1802 pssPtr->blockWaitList = cb->next; 1803 cb->next = NULL; 1804 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR); 1805 rf_FreeCallbackDesc(cb); 1806 } 1807 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1808 /* if no recon was requested while recon was blocked */ 1809 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1810 } 1811 } 1812 out: 1813 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1814 return (0); 1815 } 1816