/* rf_reconstruct.c revision 1.98 */
1/* $NetBSD: rf_reconstruct.c,v 1.98 2007/07/18 19:04:58 ad Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 
 */

/************************************************************
 *
 * rf_reconstruct.c -- code to perform on-line reconstruction
 *
 ************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.98 2007/07/18 19:04:58 ad Exp $");

#include <sys/param.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_reconutil.h"
#include "rf_revent.h"
#include "rf_reconbuffer.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_dag.h"
#include "rf_desc.h"
#include "rf_debugprint.h"
#include "rf_general.h"
#include "rf_driver.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

#include "rf_kintf.h"

/* setting these to -1 causes them to be set to their default values if not set by debug options */

/*
 * Debug printf wrappers: compiled away entirely unless RF_DEBUG_RECON is
 * set.  rf_debug_printf() takes a format plus eight argument slots, so
 * each DprintfN macro pads the unused slots with NULL.
 */
#if RF_DEBUG_RECON
#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)

#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)

#else /* RF_DEBUG_RECON */

#define Dprintf(s) {}
#define Dprintf1(s,a) {}
#define Dprintf2(s,a,b) {}
#define Dprintf3(s,a,b,c) {}
#define Dprintf4(s,a,b,c,d) {}
#define Dprintf5(s,a,b,c,d,e) {}
#define Dprintf6(s,a,b,c,d,e,f) {}
#define Dprintf7(s,a,b,c,d,e,f,g) {}

#define DDprintf1(s,a) {}
#define DDprintf2(s,a,b) {}

#endif /* RF_DEBUG_RECON */

/* Status codes returned by ProcessReconEvent() to the main recon loop. */
#define RF_RECON_DONE_READS   1
#define RF_RECON_READ_ERROR   2
#define RF_RECON_WRITE_ERROR  3
#define RF_RECON_READ_STOPPED 4

/* High/low watermarks handed to rf_pool_init() for the reconbuffer pool. */
#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16

static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
					      RF_RaidDisk_t *, int, RF_RowCol_t);
static void FreeReconDesc(RF_RaidReconDesc_t *);
static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
static int TryToRead(RF_Raid_t *, RF_RowCol_t);
static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
				RF_SectorNum_t *);
static int IssueNextWriteRequest(RF_Raid_t *);
static int ReconReadDoneProc(void *, int);
static int ReconWriteDoneProc(void *, int);
static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
			       RF_RowCol_t, RF_HeadSepLimit_t,
			       RF_ReconUnitNum_t);
static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
					      RF_ReconParityStripeStatus_t *,
					      RF_PerDiskReconCtrl_t *,
					      RF_RowCol_t, RF_StripeNum_t,
					      RF_ReconUnitNum_t);
static void ForceReconReadDoneProc(void *, int);
static void rf_ShutdownReconstruction(void *);

/* Linked-list node for a registered "reconstruction done" callback:
 * proc(raidPtr, arg) is invoked when reconstruction completes. */
struct RF_ReconDoneProc_s {
	void (*proc) (RF_Raid_t *, void *);
	void *arg;
	RF_ReconDoneProc_t *next;
};

/**************************************************************************
 *
 * sets up the parameters that will be used by the reconstruction process
 * currently there are none, except for those that the layout-specific
 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
 *
 * in the kernel, we fire off the recon thread.
 *
 **************************************************************************/

/* Shutdown hook registered by rf_ConfigureReconstruction(): tears down
 * the reconstruction-buffer pool. */
static void
rf_ShutdownReconstruction(void *ignored)
{
	pool_destroy(&rf_pools.reconbuffer);
}

/* Initialize the reconbuffer pool and register its shutdown hook.
 * Always returns 0. */
int
rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);

	return (0);
}

/* Allocate and initialize a reconstruction descriptor for rebuilding
 * column 'col' onto the spare described by spareDiskPtr at column 'scol'.
 * Caller frees it with FreeReconDesc(). */
static RF_RaidReconDesc_t *
AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
		   RF_RowCol_t scol)
{

	RF_RaidReconDesc_t *reconDesc;

	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
		  (RF_RaidReconDesc_t *));
	reconDesc->raidPtr = raidPtr;
	reconDesc->col = col;
	reconDesc->spareDiskPtr = spareDiskPtr;
	reconDesc->numDisksDone = numDisksDone;
	reconDesc->scol = scol;
	reconDesc->next = NULL;

	return (reconDesc);
}

/* Print accumulated reconstruction statistics and release the descriptor. */
static void
FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	printf("raid%d: %lu recon event waits, %lu recon delays\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->numReconEventWaits,
	       (long) reconDesc->numReconExecDelays);
#endif				/* RF_RECON_STATS > 0 */
	printf("raid%d: %lu max exec ticks\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->maxReconExecTicks);
#if (RF_RECON_STATS > 0) || defined(KERNEL)
	printf("\n");
#endif				/* (RF_RECON_STATS > 0) || KERNEL */
	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
}


/*****************************************************************************
 *
 * primary routine to reconstruct a failed disk.  This should be called from
 * within its own thread.  It won't return until reconstruction completes,
 * fails, or is aborted.
 *****************************************************************************/
int
rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	const RF_LayoutSW_t *lp;
	int rc;

	lp = raidPtr->Layout.map;
	if (lp->SubmitReconBuffer) {
		/*
		 * The current infrastructure only supports reconstructing one
		 * disk at a time for each array.
		 */
		RF_LOCK_MUTEX(raidPtr->mutex);
		while (raidPtr->reconInProgress) {
			RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
		}
		raidPtr->reconInProgress++;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
	} else {
		/* layout has no SubmitReconBuffer hook -- recon unsupported */
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		rc = EIO;
	}
	/* wake any other thread queued up waiting to reconstruct */
	RF_SIGNAL_COND(raidPtr->waitForReconCond);
	return (rc);
}

/* Reconstruct the failed disk at 'col' onto an available spare (or a
 * distributed spare when the layout supports it), then rewrite the
 * spare's component label.  Returns 0 on success, an errno (EINVAL,
 * ENOSPC) or 1 if rf_ContinueReconstructFailedDisk() failed.
 * Caller (rf_ReconstructFailedDisk) holds the reconInProgress claim. */
int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t scol;
	int numDisksDone = 0, rc;

	/* first look for a spare drive onto which to reconstruct the data */
	/* spare disk descriptors are stored in row 0.  This may have to
	 * change eventually */

	RF_LOCK_MUTEX(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		if (raidPtr->status != rf_rs_degraded) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		/* distributed sparing: no single spare column */
		scol = (-1);
	} else {
#endif
		/* spare columns live after the data columns: [numCol, numCol+numSpare) */
		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[scol];
				spareDiskPtr->status = rf_ds_used_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* fix up the component label */
		/* Don't actually need the read here.. */
		raidread_component_label(
			raidPtr->raid_cinfo[scol].ci_dev,
			raidPtr->raid_cinfo[scol].ci_vp,
			&c_label);

		raid_init_component_label( raidPtr, &c_label);
		c_label.row = 0;
		c_label.column = col;
		c_label.clean = RF_RAID_DIRTY;
		c_label.status = rf_ds_optimal;
		c_label.partitionSize = raidPtr->Disks[scol].partitionSize;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->parity_good = RF_RAID_CLEAN;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* XXXX MORE NEEDED HERE */

		raidwrite_component_label(
			raidPtr->raid_cinfo[scol].ci_dev,
			raidPtr->raid_cinfo[scol].ci_vp,
			&c_label);

	} else {
		/* Reconstruct failed. */

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* Failed disk goes back to "failed" status */
		raidPtr->Disks[col].status = rf_ds_failed;

		/* Spare disk goes back to "spare" status. */
		spareDiskPtr->status = rf_ds_spare;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	return (rc);
}

/*

   Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
   and you don't get a spare until the next Monday.  With this function
   (and hot-swappable drives) you can now put your new disk containing
   /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
   rebuild the data "on the spot".
*/

/* Rebuild the component at 'col' onto itself (the component acts as its
 * own "spare"): close and re-open the device, size it from its disklabel,
 * then run the normal reconstruction.  Returns 0 on success or an errno /
 * recon failure code. */
int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	const RF_LayoutSW_t *lp;
	RF_ComponentLabel_t c_label;
	int numDisksDone = 0, rc;
	struct partinfo dpart;
	struct vnode *vp;
	struct vattr va;
	struct lwp *lwp;
	int retcode;
	int ac;

	lp = raidPtr->Layout.map;
	if (!lp->SubmitReconBuffer) {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		/* wakeup anyone who might be waiting to do a reconstruct */
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(EIO);
	}

	/*
	 * The current infrastructure only supports reconstructing one
	 * disk at a time for each array.
	 */
	RF_LOCK_MUTEX(raidPtr->mutex);

	if (raidPtr->Disks[col].status != rf_ds_failed) {
		/* "It's gone..." */
		raidPtr->numFailures++;
		raidPtr->Disks[col].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
	}

	while (raidPtr->reconInProgress) {
		RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
	}

	raidPtr->reconInProgress++;

	/* first look for a spare drive onto which to reconstruct the
	   data.  spare disk descriptors are stored in row 0.  This
	   may have to change eventually */

	/* Actually, we don't care if it's failed or not...  On a RAID
	   set with correct parity, this function should be callable
	   on any component without ill affects. */
	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */

#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);

		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return (EINVAL);
	}
#endif
	lwp = raidPtr->engine_thread;

	/* This device may have been opened successfully the
	   first time. Close it before trying to open it again.. */

	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
#if 0
		printf("Closed the open device: %s\n",
		       raidPtr->Disks[col].devname);
#endif
		vp = raidPtr->raid_cinfo[col].ci_vp;
		ac = raidPtr->Disks[col].auto_configured;
		/* drop the array mutex across the potentially-sleeping close */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_close_component(raidPtr, vp, ac);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->raid_cinfo[col].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer)*/
	raidPtr->Disks[col].auto_configured = 0;

#if 0
	printf("About to (re-)open the device for rebuilding: %s\n",
	       raidPtr->Disks[col].devname);
#endif
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	retcode = dk_lookup(raidPtr->Disks[col].devname, lwp, &vp, UIO_SYSSPACE);

	if (retcode) {
		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
		       raidPtr->Disks[col].devname, retcode);

		/* the component isn't responding properly...
		   must be still dead :-( */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}

	/* Ok, so we can at least do a lookup...
	   How about actually getting a vp for it? */

	if ((retcode = VOP_GETATTR(vp, &va, lwp->l_cred, lwp)) != 0) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}

	/* fetch the partition geometry so we can size the component */
	retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, lwp->l_cred, lwp);
	if (retcode) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}
	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize;

	/* reserve rf_protectedSectors at the front (component label area) */
	raidPtr->Disks[col].numBlocks = dpart.part->p_size -
		rf_protectedSectors;

	raidPtr->raid_cinfo[col].ci_vp = vp;
	raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;

	raidPtr->Disks[col].dev = va.va_rdev;

	/* we allow the user to specify that only a fraction
	   of the disks should be used this is just for debug:
	   it speeds up * the parity scan */
	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
		rf_sizePercentage / 100;
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/* the component itself doubles as the "spare" for in-place rebuild */
	spareDiskPtr = &raidPtr->Disks[col];
	spareDiskPtr->status = rf_ds_used_spare;

	printf("raid%d: initiating in-place reconstruction on column %d\n",
	       raidPtr->raidid, col);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
				       numDisksDone, col);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		/* Need to set these here, as at this point it'll be claiming
		   that the disk is in rf_ds_spared!  But we know better :-) */

		raidPtr->Disks[col].status = rf_ds_optimal;
		raidPtr->status = rf_rs_optimal;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* fix up the component label */
		/* Don't actually need the read here.. */
		raidread_component_label(raidPtr->raid_cinfo[col].ci_dev,
					 raidPtr->raid_cinfo[col].ci_vp,
					 &c_label);

		RF_LOCK_MUTEX(raidPtr->mutex);
		raid_init_component_label(raidPtr, &c_label);

		c_label.row = 0;
		c_label.column = col;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		raidPtr->parity_good = RF_RAID_CLEAN;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev,
					  raidPtr->raid_cinfo[col].ci_vp,
					  &c_label);

	} else {
		/* Reconstruct-in-place failed.  Disk goes back to
		   "failed" status, regardless of what it was before. */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->Disks[col].status = rf_ds_failed;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
	}

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->reconInProgress--;
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	RF_SIGNAL_COND(raidPtr->waitForReconCond);
	return (rc);
}


/* The reconstruction engine proper: quiesce the array, install the recon
 * control structure, issue the initial read on every surviving column,
 * then drive the event loop until all reads and writes have completed
 * (or an error aborts the rebuild).  Returns 0 on success, 1 on failure. */
int
rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_RowCol_t col = reconDesc->col;
	RF_RowCol_t scol = reconDesc->scol;
	RF_ReconMap_t *mapPtr;
	RF_ReconCtrl_t *tmp_reconctrl;
	RF_ReconEvent_t *event;
	RF_CallbackDesc_t *p;
	struct timeval etime, elpsd;
	unsigned long xor_s, xor_resid_us;
	int i, ds;
	int status;
	int recon_error, write_error;

	raidPtr->accumXorTimeUs = 0;
#if RF_ACC_TRACE > 0
	/* create one trace record per physical disk */
	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
#endif

	/* quiesce the array prior to starting recon.  this is needed
	 * to assure no nasty interactions with pending user writes.
	 * We need to do this before we change the disk or row status. */

	Dprintf("RECON: begin request suspend\n");
	rf_SuspendNewRequestsAndWait(raidPtr);
	Dprintf("RECON: end request suspend\n");

	/* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);

	RF_LOCK_MUTEX(raidPtr->mutex);

	/* create the reconstruction control pointer and install it in
	 * the right slot */
	raidPtr->reconControl = tmp_reconctrl;
	mapPtr = raidPtr->reconControl->reconMap;
	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
	raidPtr->reconControl->numRUsComplete = 0;
	raidPtr->status = rf_rs_reconstructing;
	raidPtr->Disks[col].status = rf_ds_reconstructing;
	raidPtr->Disks[col].spareCol = scol;

	RF_UNLOCK_MUTEX(raidPtr->mutex);

	RF_GETTIME(raidPtr->reconControl->starttime);

	/* now start up the actual reconstruction: issue a read for
	 * each surviving disk */

	reconDesc->numDisksDone = 0;
	for (i = 0; i < raidPtr->numCol; i++) {
		if (i != col) {
			/* find and issue the next I/O on the
			 * indicated disk */
			if (IssueNextReadRequest(raidPtr, i)) {
				Dprintf1("RECON: done issuing for c%d\n", i);
				reconDesc->numDisksDone++;
			}
		}
	}

	Dprintf("RECON: resume requests\n");
	rf_ResumeNewRequests(raidPtr);

	/* process reconstruction events until all disks report that
	 * they've completed all work */

	mapPtr = raidPtr->reconControl->reconMap;
	recon_error = 0;
	write_error = 0;

	while (reconDesc->numDisksDone < raidPtr->numCol - 1) {

		event = rf_GetNextReconEvent(reconDesc);
		status = ProcessReconEvent(raidPtr, event);

		/* the normal case is that a read completes, and all is well. */
		if (status == RF_RECON_DONE_READS) {
			reconDesc->numDisksDone++;
		} else if ((status == RF_RECON_READ_ERROR) ||
			   (status == RF_RECON_WRITE_ERROR)) {
			/* an error was encountered while reconstructing...
			   Pretend we've finished this disk.
			*/
			recon_error = 1;
			raidPtr->reconControl->error = 1;

			/* bump the numDisksDone count for reads,
			   but not for writes */
			if (status == RF_RECON_READ_ERROR)
				reconDesc->numDisksDone++;

			/* write errors are special -- when we are
			   done dealing with the reads that are
			   finished, we don't want to wait for any
			   writes */
			if (status == RF_RECON_WRITE_ERROR)
				write_error = 1;

		} else if (status == RF_RECON_READ_STOPPED) {
			/* count this component as being "done" */
			reconDesc->numDisksDone++;
		}

		if (recon_error) {

			/* make sure any stragglers are woken up so that
			   their theads will complete, and we can get out
			   of here with all IO processed */

			while (raidPtr->reconControl->headSepCBList) {
				p = raidPtr->reconControl->headSepCBList;
				raidPtr->reconControl->headSepCBList = p->next;
				p->next = NULL;
				rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
				rf_FreeCallbackDesc(p);
			}
		}

		raidPtr->reconControl->numRUsTotal =
			mapPtr->totalRUs;
		raidPtr->reconControl->numRUsComplete =
			mapPtr->totalRUs -
			rf_UnitsLeftToReconstruct(mapPtr);

#if RF_DEBUG_RECON
		raidPtr->reconControl->percentComplete =
			(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
		if (rf_prReconSched) {
			rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
		}
#endif
	}

	mapPtr = raidPtr->reconControl->reconMap;
	if (rf_reconDebug) {
		printf("RECON: all reads completed\n");
	}
	/* at this point all the reads have completed.  We now wait
	 * for any pending writes to complete, and then we're done */

	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {

		event = rf_GetNextReconEvent(reconDesc);
		status = ProcessReconEvent(raidPtr, event);

		if (status == RF_RECON_WRITE_ERROR) {
			recon_error = 1;
			raidPtr->reconControl->error = 1;
			/* an error was encountered at the very end... bail */
		} else {
#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}
	}

	if (recon_error) {
		/* we've encountered an error in reconstructing. */
		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);

		/* we start by blocking IO to the RAID set. */
		rf_SuspendNewRequestsAndWait(raidPtr);

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* mark set as being degraded, rather than
		   rf_rs_reconstructing as we were before the problem.
		   After this is done we can update status of the
		   component disks without worrying about someone
		   trying to read from a failed component.
		*/
		raidPtr->status = rf_rs_degraded;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* resume IO */
		rf_ResumeNewRequests(raidPtr);

		/* At this point there are two cases:
		   1) If we've experienced a read error, then we've
		   already waited for all the reads we're going to get,
		   and we just need to wait for the writes.

		   2) If we've experienced a write error, we've also
		   already waited for all the reads to complete,
		   but there is little point in waiting for the writes --
		   when they do complete, they will just be ignored.

		   So we just wait for writes to complete if we didn't have a
		   write error.
		*/

		if (!write_error) {
			/* wait for writes to complete */
			while (raidPtr->reconControl->pending_writes > 0) {

				event = rf_GetNextReconEvent(reconDesc);
				status = ProcessReconEvent(raidPtr, event);

				if (status == RF_RECON_WRITE_ERROR) {
					raidPtr->reconControl->error = 1;
					/* an error was encountered at the very end... bail.
					   This will be very bad news for the user, since
					   at this point there will have been a read error
					   on one component, and a write error on another!
					*/
					break;
				}
			}
		}


		/* cleanup */

		/* drain the event queue - after waiting for the writes above,
		   there shouldn't be much (if anything!) left in the queue. */

		rf_DrainReconEventQueue(reconDesc);

		/* XXX As much as we'd like to free the recon control structure
		   and the reconDesc, we have no way of knowing if/when those will
		   be touched by IO that has yet to occur.  It is rather poor to be
		   basically causing a 'memory leak' here, but there doesn't seem to be
		   a cleaner alternative at this time.  Perhaps when the reconstruct code
		   gets a makeover this problem will go away.
		*/
#if 0
		rf_FreeReconControl(raidPtr);
#endif

#if RF_ACC_TRACE > 0
		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
		/* XXX see comment above */
#if 0
		FreeReconDesc(reconDesc);
#endif

		return (1);
	}

	/* Success:  mark the dead disk as reconstructed.  We quiesce
	 * the array here to assure no nasty interactions with pending
	 * user accesses when we free up the psstatus structure as
	 * part of FreeReconControl() */

	rf_SuspendNewRequestsAndWait(raidPtr);

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->numFailures--;
	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	RF_GETTIME(etime);
	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);

	rf_ResumeNewRequests(raidPtr);

	printf("raid%d: Reconstruction of disk at col %d completed\n",
	       raidPtr->raidid, col);
	xor_s = raidPtr->accumXorTimeUs / 1000000;
	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
	       raidPtr->raidid,
	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
	printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
	       raidPtr->raidid,
	       (int) raidPtr->reconControl->starttime.tv_sec,
	       (int) raidPtr->reconControl->starttime.tv_usec,
	       (int) etime.tv_sec, (int) etime.tv_usec);
#if RF_RECON_STATS > 0
	printf("raid%d: Total head-sep stall count was %d\n",
	       raidPtr->raidid, (int) reconDesc->hsStallCount);
#endif				/* RF_RECON_STATS > 0 */
	rf_FreeReconControl(raidPtr);
#if RF_ACC_TRACE > 0
	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
	FreeReconDesc(reconDesc);

	return (0);

}
/*****************************************************************************
 * do the right thing upon each reconstruction event.
 *****************************************************************************/
/* Dispatch one reconstruction event and free it.  Returns one of the
 * RF_RECON_* status codes consumed by rf_ContinueReconstructFailedDisk():
 * default is RF_RECON_READ_STOPPED; 0 means "more work pending". */
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
{
	int retcode = 0, submitblocked;
	RF_ReconBuffer_t *rbuf;
	RF_SectorCount_t sectorsPerRU;

	retcode = RF_RECON_READ_STOPPED;

	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
	switch (event->type) {

		/* a read I/O has completed */
	case RF_REVENT_READDONE:
		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
			 event->col, rbuf->parityStripeID);
		Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
			 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
			 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
			if (!submitblocked)
				retcode = IssueNextReadRequest(raidPtr, event->col);
			else
				retcode = 0;
		}
		break;

		/* a write I/O has completed */
	case RF_REVENT_WRITEDONE:
#if RF_DEBUG_RECON
		if (rf_floatingRbufDebug) {
			rf_CheckFloatingRbufCount(raidPtr, 1);
		}
#endif
		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
			 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
		/* mark this reconstruction unit as rebuilt in the recon map */
		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
				  rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);

		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
			/* hand-rolled sleep lock (rb_lock flag + rb_mutex)
			 * serializing access to the floating-rbuf state */
			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
			while(raidPtr->reconControl->rb_lock) {
				ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
					&raidPtr->reconControl->rb_mutex);
			}
			raidPtr->reconControl->rb_lock = 1;
			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

			raidPtr->numFullReconBuffers--;
			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);

			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
			raidPtr->reconControl->rb_lock = 0;
			wakeup(&raidPtr->reconControl->rb_lock);
			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		} else
			if (rbuf->type == RF_RBUF_TYPE_FORCED)
				rf_FreeReconBuffer(rbuf);
			else
				RF_ASSERT(0);
		retcode = 0;
		break;

	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
					 * cleared */
		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
							     0, (int) (long) event->arg);
			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
							 * BUFCLEAR event if we
							 * couldn't submit */
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
					 * blockage has been cleared */
		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
					 * reconstruction blockage has been
					 * cleared */
		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

		/* a buffer has become ready to write */
	case RF_REVENT_BUFREADY:
		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextWriteRequest(raidPtr);
#if RF_DEBUG_RECON
			if (rf_floatingRbufDebug) {
				rf_CheckFloatingRbufCount(raidPtr, 1);
			}
#endif
		}
		break;

		/* we need to skip the current RU entirely because it got
		 * recon'd while we were waiting for something else to happen */
	case RF_REVENT_SKIP:
		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

		/* a forced-reconstruction read access has completed.  Just
		 * submit the buffer */
	case RF_REVENT_FORCEDREADDONE:
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
			RF_ASSERT(!submitblocked);
		}
		break;

		/* A read I/O failed to complete */
	case RF_REVENT_READ_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

		/* A write I/O failed to complete */
	case RF_REVENT_WRITE_FAILED:
		retcode = RF_RECON_WRITE_ERROR;

		rbuf = (RF_ReconBuffer_t *) event->arg;

		/* cleanup the disk queue data */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);

		/* At this point we're erroring out, badly, and floatingRbufs
		   may not even be valid.  Rather than putting this back onto
		   the floatingRbufs list, just arrange for its immediate
		   destruction.
		*/
		rf_FreeReconBuffer(rbuf);
		break;

		/* a forced read I/O failed to complete */
	case RF_REVENT_FORCEDREAD_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

	default:
		RF_PANIC();
	}
	rf_FreeReconEventDesc(event);
	return (retcode);
}
/*****************************************************************************
 *
 * find the next thing that's needed on the indicated disk, and issue
 * a read request for it.  We assume that the reconstruction buffer
 * associated with this process is free to receive the data.  If
 * reconstruction is blocked on the indicated RU, we issue a
 * blockage-release request instead of a physical disk read request.
 * If the current disk gets too far ahead of the others, we issue a
 * head-separation wait request and return.
 *
 * ctrl->{ru_count, curPSID, diskOffset} and
 * rbuf->failedDiskSectorOffset are maintained to point to the unit
 * we're currently accessing.  Note that this deviates from the
 * standard C idiom of having counters point to the next thing to be
 * accessed.  This allows us to easily retry when we're blocked by
 * head separation or reconstruction-blockage events.
 *
 *****************************************************************************/
static int
IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	int do_new_check = 0, retcode = 0, status;

	/* if we are currently the slowest disk, mark that we have to do a new
	 * check */
	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
		do_new_check = 1;

	/* advance to the next reconstruction unit that still needs work,
	 * stepping within the current parity stripe first and moving to the
	 * next PSID when the current one is exhausted */
	while (1) {

		ctrl->ru_count++;
		if (ctrl->ru_count < RUsPerPU) {
			ctrl->diskOffset += sectorsPerRU;
			rbuf->failedDiskSectorOffset += sectorsPerRU;
		} else {
			ctrl->curPSID++;
			ctrl->ru_count = 0;
			/* code left over from when head-sep was based on
			 * parity stripe id */
			if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) {
				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
				return (RF_RECON_DONE_READS);	/* finito! */
			}
			/* find the disk offsets of the start of the parity
			 * stripe on both the current disk and the failed
			 * disk.  skip this entire parity stripe if either disk
			 * does not appear in the indicated PS */
			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
			    &rbuf->spCol, &rbuf->spOffset);
			if (status) {
				/* force the next iteration to roll over to
				 * the following parity stripe */
				ctrl->ru_count = RUsPerPU - 1;
				continue;
			}
		}
		rbuf->which_ru = ctrl->ru_count;

		/* skip this RU if it's already been reconstructed */
		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
			continue;
		}
		break;
	}
	ctrl->headSepCounter++;
	if (do_new_check)
		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */


	/* at this point, we have definitely decided what to do, and we have
	 * only to see if we can actually do it now */
	rbuf->parityStripeID = ctrl->curPSID;
	rbuf->which_ru = ctrl->ru_count;
#if RF_ACC_TRACE > 0
	memset((char *) &raidPtr->recon_tracerecs[col], 0,
	    sizeof(raidPtr->recon_tracerecs[col]));
	raidPtr->recon_tracerecs[col].reconacc = 1;
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	retcode = TryToRead(raidPtr, col);
	return (retcode);
}

/*
 * tries to issue the next read on the indicated disk.  We may be
 * blocked by (a) the heads being too far apart, or (b) recon on the
 * indicated RU being blocked due to a write by a user thread.  In
 * this case, we issue a head-sep or blockage wait request, which will
 * cause this same routine to be invoked again later when the blockage
 * has cleared.
 */

static int
TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
	RF_StripeNum_t psid = ctrl->curPSID;
	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
	RF_DiskQueueData_t *req;
	int status;
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;

	/* if the current disk is too far ahead of the others, issue a
	 * head-separation wait and return */
	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
		return (0);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	/* PSS mutex is held from here until the 'out' label */
	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);

	/* lookup returned an existing entry; discard our preallocation */
	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is blocked on the indicated parity stripe, issue a
	 * block-wait request and return. this also must mark the indicated RU
	 * in the stripe as under reconstruction if not blocked. */
	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
	if (status == RF_PSS_RECON_BLOCKED) {
		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
		goto out;
	} else
		if (status == RF_PSS_FORCED_ON_WRITE) {
			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
			goto out;
		}
	/* make one last check to be sure that the indicated RU didn't get
	 * reconstructed while we were waiting for something else to happen.
	 * This is unfortunate in that it causes us to make this check twice
	 * in the normal case.  Might want to make some attempt to re-work
	 * this so that we only do this check if we've definitely blocked on
	 * one of the above checks.  When this condition is detected, we may
	 * have just created a bogus status entry, which we need to delete. */
	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
		if (pssPtr == newpssPtr)
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
		goto out;
	}
	/* found something to read.  issue the I/O */
	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
	    psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	/* should be ok to use a NULL proc pointer here, all the bufs we use
	 * should be in kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
	    ReconReadDoneProc, (void *) ctrl,
#if RF_ACC_TRACE > 0
	    &raidPtr->recon_tracerecs[col],
#else
	    NULL,
#endif
	    (void *) raidPtr, 0, NULL, PR_WAITOK);

	ctrl->rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
	pssPtr->issued[col] = 1;

out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}


/*
 * given a parity stripe ID, we want to find out whether both the
 * current disk and the failed disk exist in that parity stripe.  If
 * not, we want to skip this whole PS.
If so, we want to find the 1212 * disk offset of the start of the PS on both the current disk and the 1213 * failed disk. 1214 * 1215 * this works by getting a list of disks comprising the indicated 1216 * parity stripe, and searching the list for the current and failed 1217 * disks. Once we've decided they both exist in the parity stripe, we 1218 * need to decide whether each is data or parity, so that we'll know 1219 * which mapping function to call to get the corresponding disk 1220 * offsets. 1221 * 1222 * this is kind of unpleasant, but doing it this way allows the 1223 * reconstruction code to use parity stripe IDs rather than physical 1224 * disks address to march through the failed disk, which greatly 1225 * simplifies a lot of code, as well as eliminating the need for a 1226 * reverse-mapping function. I also think it will execute faster, 1227 * since the calls to the mapping module are kept to a minimum. 1228 * 1229 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING 1230 * THE STRIPE IN THE CORRECT ORDER 1231 * 1232 * raidPtr - raid descriptor 1233 * psid - parity stripe identifier 1234 * col - column of disk to find the offsets for 1235 * spCol - out: col of spare unit for failed unit 1236 * spOffset - out: offset into disk containing spare unit 1237 * 1238 */ 1239 1240 1241static int 1242ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid, 1243 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset, 1244 RF_SectorNum_t *outFailedDiskSectorOffset, 1245 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset) 1246{ 1247 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1248 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1249 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ 1250 RF_RowCol_t *diskids; 1251 u_int i, j, k, i_offset, j_offset; 1252 RF_RowCol_t pcol; 1253 int testcol; 1254 RF_SectorNum_t poffset; 1255 char i_is_parity = 0, j_is_parity = 0; 1256 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; 1257 1258 /* get 
a listing of the disks comprising that stripe */ 1259 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); 1260 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids); 1261 RF_ASSERT(diskids); 1262 1263 /* reject this entire parity stripe if it does not contain the 1264 * indicated disk or it does not contain the failed disk */ 1265 1266 for (i = 0; i < stripeWidth; i++) { 1267 if (col == diskids[i]) 1268 break; 1269 } 1270 if (i == stripeWidth) 1271 goto skipit; 1272 for (j = 0; j < stripeWidth; j++) { 1273 if (fcol == diskids[j]) 1274 break; 1275 } 1276 if (j == stripeWidth) { 1277 goto skipit; 1278 } 1279 /* find out which disk the parity is on */ 1280 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP); 1281 1282 /* find out if either the current RU or the failed RU is parity */ 1283 /* also, if the parity occurs in this stripe prior to the data and/or 1284 * failed col, we need to decrement i and/or j */ 1285 for (k = 0; k < stripeWidth; k++) 1286 if (diskids[k] == pcol) 1287 break; 1288 RF_ASSERT(k < stripeWidth); 1289 i_offset = i; 1290 j_offset = j; 1291 if (k < i) 1292 i_offset--; 1293 else 1294 if (k == i) { 1295 i_is_parity = 1; 1296 i_offset = 0; 1297 } /* set offsets to zero to disable multiply 1298 * below */ 1299 if (k < j) 1300 j_offset--; 1301 else 1302 if (k == j) { 1303 j_is_parity = 1; 1304 j_offset = 0; 1305 } 1306 /* at this point, [ij]_is_parity tells us whether the [current,failed] 1307 * disk is parity at the start of this RU, and, if data, "[ij]_offset" 1308 * tells us how far into the stripe the [current,failed] disk is. */ 1309 1310 /* call the mapping routine to get the offset into the current disk, 1311 * repeat for failed disk. 
*/ 1312 if (i_is_parity) 1313 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1314 else 1315 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP); 1316 1317 RF_ASSERT(col == testcol); 1318 1319 if (j_is_parity) 1320 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1321 else 1322 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); 1323 RF_ASSERT(fcol == testcol); 1324 1325 /* now locate the spare unit for the failed unit */ 1326#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1327 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { 1328 if (j_is_parity) 1329 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1330 else 1331 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP); 1332 } else { 1333#endif 1334 *spCol = raidPtr->reconControl->spareCol; 1335 *spOffset = *outFailedDiskSectorOffset; 1336#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0 1337 } 1338#endif 1339 return (0); 1340 1341skipit: 1342 Dprintf2("RECON: Skipping psid %ld: nothing needed from r%d c%d\n", 1343 psid, col); 1344 return (1); 1345} 1346/* this is called when a buffer has become ready to write to the replacement disk */ 1347static int 1348IssueNextWriteRequest(RF_Raid_t *raidPtr) 1349{ 1350 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; 1351 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; 1352#if RF_ACC_TRACE > 0 1353 RF_RowCol_t fcol = raidPtr->reconControl->fcol; 1354#endif 1355 RF_ReconBuffer_t *rbuf; 1356 RF_DiskQueueData_t *req; 1357 1358 rbuf = 
rf_GetFullReconBuffer(raidPtr->reconControl); 1359 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't 1360 * have gotten the event that sent us here */ 1361 RF_ASSERT(rbuf->pssPtr); 1362 1363 rbuf->pssPtr->writeRbuf = rbuf; 1364 rbuf->pssPtr = NULL; 1365 1366 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", 1367 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, 1368 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); 1369 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", 1370 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, 1371 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); 1372 1373 /* should be ok to use a NULL b_proc here b/c all addrs should be in 1374 * kernel space */ 1375 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, 1376 sectorsPerRU, rbuf->buffer, 1377 rbuf->parityStripeID, rbuf->which_ru, 1378 ReconWriteDoneProc, (void *) rbuf, 1379#if RF_ACC_TRACE > 0 1380 &raidPtr->recon_tracerecs[fcol], 1381#else 1382 NULL, 1383#endif 1384 (void *) raidPtr, 0, NULL, PR_WAITOK); 1385 1386 rbuf->arg = (void *) req; 1387 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex); 1388 raidPtr->reconControl->pending_writes++; 1389 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex); 1390 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY); 1391 1392 return (0); 1393} 1394 1395/* 1396 * this gets called upon the completion of a reconstruction read 1397 * operation the arg is a pointer to the per-disk reconstruction 1398 * control structure for the process that just finished a read. 1399 * 1400 * called at interrupt context in the kernel, so don't do anything 1401 * illegal here. 
 */
/*
 * Read-completion callback for recon reads (runs at interrupt context).
 * Converts the completed I/O into a READDONE or READ_FAILED recon event.
 */
static int
ReconReadDoneProc(void *arg, int status)
{
	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
	RF_Raid_t *raidPtr;

	/* Detect that reconCtrl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
	   There won't be anyone listening for this event anyway */

	if (ctrl->reconCtrl == NULL)
		return(0);

	raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;

	if (status) {
		printf("raid%d: Recon read failed!\n", raidPtr->raidid);
		rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
		return(0);
	}
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
#endif
	rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
	return (0);
}
/* this gets called upon the completion of a reconstruction write operation.
 * the arg is a pointer to the rbuf that was just written
 *
 * called at interrupt context in the kernel, so don't do anything illegal here.
 */
static int
ReconWriteDoneProc(void *arg, int status)
{
	RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;

	/* Detect that reconControl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
	   There won't be anyone listening for this event anyway */

	if (rbuf->raidPtr->reconControl == NULL)
		return(0);

	Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
	if (status) {
		printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
		return(0);
	}
	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
	return (0);
}


/*
 * computes a new minimum head sep, and wakes up anyone who needs to
 * be woken as a result
 */
static void
CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_HeadSepLimit_t new_min;
	RF_RowCol_t i;
	RF_CallbackDesc_t *p;
	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
								 * of a minimum */


	/* take the rb_lock sleep-lock so the head-sep callback list is
	 * stable while we scan and wake waiters */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);

	/* start from the largest positive long, then take the minimum over
	 * all surviving columns */
	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
	for (i = 0; i < raidPtr->numCol; i++)
		if (i != reconCtrlPtr->fcol) {
			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
		}
	/* set the new minimum and wake up anyone who can now run again */
	if (new_min != reconCtrlPtr->minHeadSepCounter) {
		reconCtrlPtr->minHeadSepCounter = new_min;
		Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
		/* the list is sorted by callbackArg.v, so we can stop at the
		 * first waiter whose threshold hasn't been reached */
		while (reconCtrlPtr->headSepCBList) {
			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
				break;
			p = reconCtrlPtr->headSepCBList;
			reconCtrlPtr->headSepCBList = p->next;
			p->next = NULL;
			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
			rf_FreeCallbackDesc(p);
		}

	}
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	wakeup(&reconCtrlPtr->rb_lock);
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
}

/*
 * checks to see that the maximum head separation will not be violated
 * if we initiate a reconstruction I/O on the indicated disk.
 * Limiting the maximum head separation between two disks eliminates
 * the nasty buffer-stall conditions that occur when one disk races
 * ahead of the others and consumes all of the floating recon buffers.
 * This code is complex and unpleasant but it's necessary to avoid
 * some very nasty, albeit fairly rare, reconstruction behavior.
 *
 * returns non-zero if and only if we have to stop working on the
 * indicated disk due to a head-separation delay.
 */
static int
CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
		    RF_ReconUnitNum_t which_ru)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_CallbackDesc_t *cb, *p, *pt;
	int retval = 0;

	/* if we're too far ahead of the slowest disk, stop working on this
	 * disk until the slower ones catch up.  We do this by scheduling a
	 * wakeup callback for the time when the slowest disk has caught up.
	 * We define "caught up" with 20% hysteresis, i.e. the head separation
	 * must have fallen to at most 80% of the max allowable head
	 * separation before we'll wake up.
 *
 */
	/* take the rb_lock sleep-lock so the sorted head-sep callback list
	 * can't change under us */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
	/* headSepLimit < 0 disables head-separation limiting entirely */
	if ((raidPtr->headSepLimit >= 0) &&
	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
			 raidPtr->raidid, col, ctrl->headSepCounter,
			 reconCtrlPtr->minHeadSepCounter,
			 raidPtr->headSepLimit);
		cb = rf_AllocCallbackDesc();
		/* the minHeadSepCounter value we have to get to before we'll
		 * wake up.  build in 20% hysteresis. */
		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
		cb->col = col;
		cb->next = NULL;

		/* insert this callback descriptor into the sorted list of
		 * pending head-sep callbacks */
		p = reconCtrlPtr->headSepCBList;
		if (!p)
			reconCtrlPtr->headSepCBList = cb;
		else
			if (cb->callbackArg.v < p->callbackArg.v) {
				cb->next = reconCtrlPtr->headSepCBList;
				reconCtrlPtr->headSepCBList = cb;
			} else {
				/* walk to the insertion point; pt trails p */
				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
				cb->next = p;
				pt->next = cb;
			}
		retval = 1;
#if RF_RECON_STATS > 0
		ctrl->reconCtrl->reconDesc->hsStallCount++;
#endif				/* RF_RECON_STATS > 0 */
	}
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	wakeup(&reconCtrlPtr->rb_lock);
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);

	return (retval);
}
/*
 * checks to see if reconstruction has been either forced or blocked
 * by a user operation.  if forced, we skip this RU entirely.  else if
 * blocked, put ourselves on the wait list.  else return 0.
1588 * 1589 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY 1590 */ 1591static int 1592CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, 1593 RF_ReconParityStripeStatus_t *pssPtr, 1594 RF_PerDiskReconCtrl_t *ctrl, 1595 RF_RowCol_t col, 1596 RF_StripeNum_t psid, 1597 RF_ReconUnitNum_t which_ru) 1598{ 1599 RF_CallbackDesc_t *cb; 1600 int retcode = 0; 1601 1602 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) 1603 retcode = RF_PSS_FORCED_ON_WRITE; 1604 else 1605 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { 1606 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru); 1607 cb = rf_AllocCallbackDesc(); /* append ourselves to 1608 * the blockage-wait 1609 * list */ 1610 cb->col = col; 1611 cb->next = pssPtr->blockWaitList; 1612 pssPtr->blockWaitList = cb; 1613 retcode = RF_PSS_RECON_BLOCKED; 1614 } 1615 if (!retcode) 1616 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under 1617 * reconstruction */ 1618 1619 return (retcode); 1620} 1621/* 1622 * if reconstruction is currently ongoing for the indicated stripeID, 1623 * reconstruction is forced to completion and we return non-zero to 1624 * indicate that the caller must wait. If not, then reconstruction is 1625 * blocked on the indicated stripe and the routine returns zero. If 1626 * and only if we return non-zero, we'll cause the cbFunc to get 1627 * invoked with the cbArg when the reconstruction has completed. 
 */
int
rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
{
	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
							 * forcing recon on */
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
								 * stripe status structure */
	RF_StripeNum_t psid;	/* parity stripe id */
	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
						 * offset */
	RF_RowCol_t *diskids;
	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
	RF_RowCol_t fcol, diskno, i;
	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
	RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
	RF_CallbackDesc_t *cb;
	int nPromoted;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	/* PSS mutex held for the remainder of the routine */
	RF_LOCK_PSS_MUTEX(raidPtr, psid);

	/* atomically create-or-find the status entry with the RECON_BLOCKED
	 * flag set, so a new recon can't slip in underneath us */
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);

	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is not ongoing on this PS, just return */
	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
		return (0);
	}
	/* otherwise, we have to wait for reconstruction to complete on this
	 * RU. */
	/* In order to avoid waiting for a potentially large number of
	 * low-priority accesses to complete, we force a normal-priority (i.e.
	 * not low-priority) reconstruction on this RU. */
	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
		DDprintf1("Forcing recon on psid %ld\n", psid);
		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
								 * forced recon */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
							 * that we just set */
		fcol = raidPtr->reconControl->fcol;

		/* get a listing of the disks comprising the indicated stripe */
		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);

		/* For previously issued reads, elevate them to normal
		 * priority.  If the I/O has already completed, it won't be
		 * found in the queue, and hence this will be a no-op. For
		 * unissued reads, allocate buffers and issue new reads.  The
		 * fact that we've set the FORCED bit means that the regular
		 * recon procs will not re-issue these reqs */
		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
			if ((diskno = diskids[i]) != fcol) {
				if (pssPtr->issued[diskno]) {
					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
					if (rf_reconDebug && nPromoted)
						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
				} else {
					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
											 * location */
					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
					new_rbuf->which_ru = which_ru;
					new_rbuf->failedDiskSectorOffset = fd_offset;
					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;

					/* use NULL b_proc b/c all addrs
					 * should be in kernel space */
					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);

					new_rbuf->arg = req;
					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
				}
			}
		/* if the write is sitting in the disk queue, elevate its
		 * priority */
		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
			printf("raid%d: promoted write to col %d\n",
			    raidPtr->raidid, fcol);
	}
	/* install a callback descriptor to be invoked when recon completes on
	 * this parity stripe. */
	cb = rf_AllocCallbackDesc();
	/* XXX the following is bogus.. These functions don't really match!!
	 * GO */
	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
	cb->callbackArg.p = (void *) cbArg;
	cb->next = pssPtr->procWaitList;
	pssPtr->procWaitList = cb;
	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
	    raidPtr->raidid, psid);

	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (1);
}
/* called upon the completion of a forced reconstruction read.
 * all we do is schedule the FORCEDREADONE event.
 * called at interrupt context in the kernel, so don't do anything illegal here.
 */
static void
ForceReconReadDoneProc(void *arg, int status)
{
	RF_ReconBuffer_t *rbuf = arg;

	/* Detect that reconControl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
1748 There won't be anyone listening for this event anyway */ 1749 1750 if (rbuf->raidPtr->reconControl == NULL) 1751 return; 1752 1753 if (status) { 1754 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid); 1755 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED); 1756 return; 1757 } 1758 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); 1759} 1760/* releases a block on the reconstruction of the indicated stripe */ 1761int 1762rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap) 1763{ 1764 RF_StripeNum_t stripeID = asmap->stripeID; 1765 RF_ReconParityStripeStatus_t *pssPtr; 1766 RF_ReconUnitNum_t which_ru; 1767 RF_StripeNum_t psid; 1768 RF_CallbackDesc_t *cb; 1769 1770 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); 1771 RF_LOCK_PSS_MUTEX(raidPtr, psid); 1772 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL); 1773 1774 /* When recon is forced, the pss desc can get deleted before we get 1775 * back to unblock recon. But, this can _only_ happen when recon is 1776 * forced. It would be good to put some kind of sanity check here, but 1777 * how to decide if recon was just forced or not? 
*/ 1778 if (!pssPtr) { 1779 /* printf("Warning: no pss descriptor upon unblock on psid %ld 1780 * RU %d\n",psid,which_ru); */ 1781#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0) 1782 if (rf_reconDebug || rf_pssDebug) 1783 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru); 1784#endif 1785 goto out; 1786 } 1787 pssPtr->blockCount--; 1788 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n", 1789 raidPtr->raidid, psid, pssPtr->blockCount); 1790 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ 1791 1792 /* unblock recon before calling CauseReconEvent in case 1793 * CauseReconEvent causes us to try to issue a new read before 1794 * returning here. */ 1795 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; 1796 1797 1798 while (pssPtr->blockWaitList) { 1799 /* spin through the block-wait list and 1800 release all the waiters */ 1801 cb = pssPtr->blockWaitList; 1802 pssPtr->blockWaitList = cb->next; 1803 cb->next = NULL; 1804 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR); 1805 rf_FreeCallbackDesc(cb); 1806 } 1807 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { 1808 /* if no recon was requested while recon was blocked */ 1809 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr); 1810 } 1811 } 1812out: 1813 RF_UNLOCK_PSS_MUTEX(raidPtr, psid); 1814 return (0); 1815} 1816