rf_dagdegrd.c revision 1.1
1/* $NetBSD: rf_dagdegrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 
27 */ 28 29/* 30 * rf_dagdegrd.c 31 * 32 * code for creating degraded read DAGs 33 * 34 * : 35 * Log: rf_dagdegrd.c,v 36 * Revision 1.20 1996/11/05 21:10:40 jimz 37 * failed pda generalization 38 * 39 * Revision 1.19 1996/08/19 23:30:36 jimz 40 * fix chained declustered accesses in degraded mode when mirror copy is failed 41 * (workload shifting not allowed when there are no duplicate copies extant) 42 * 43 * Revision 1.18 1996/07/31 16:29:01 jimz 44 * asm/asmap re-fix (EO merge) 45 * 46 * Revision 1.17 1996/07/31 15:34:34 jimz 47 * evenodd changes; bugfixes for double-degraded archs, generalize 48 * some formerly PQ-only functions 49 * 50 * Revision 1.16 1996/07/28 20:31:39 jimz 51 * i386netbsd port 52 * true/false fixup 53 * 54 * Revision 1.15 1996/07/27 23:36:08 jimz 55 * Solaris port of simulator 56 * 57 * Revision 1.14 1996/07/22 19:52:16 jimz 58 * switched node params to RF_DagParam_t, a union of 59 * a 64-bit int and a void *, for better portability 60 * attempted hpux port, but failed partway through for 61 * lack of a single C compiler capable of compiling all 62 * source files 63 * 64 * Revision 1.13 1996/06/09 02:36:46 jimz 65 * lots of little crufty cleanup- fixup whitespace 66 * issues, comment #ifdefs, improve typing in some 67 * places (esp size-related) 68 * 69 * Revision 1.12 1996/06/07 22:26:27 jimz 70 * type-ify which_ru (RF_ReconUnitNum_t) 71 * 72 * Revision 1.11 1996/06/07 21:33:04 jimz 73 * begin using consistent types for sector numbers, 74 * stripe numbers, row+col numbers, recon unit numbers 75 * 76 * Revision 1.10 1996/05/31 22:26:54 jimz 77 * fix a lot of mapping problems, memory allocation problems 78 * found some weird lock issues, fixed 'em 79 * more code cleanup 80 * 81 * Revision 1.9 1996/05/30 11:29:41 jimz 82 * Numerous bug fixes. 
Stripe lock release code disagreed with the taking code 83 * about when stripes should be locked (I made it consistent: no parity, no lock) 84 * There was a lot of extra serialization of I/Os which I've removed- a lot of 85 * it was to calculate values for the cache code, which is no longer with us. 86 * More types, function, macro cleanup. Added code to properly quiesce the array 87 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general 88 * before. Fixed memory allocation, freeing bugs. 89 * 90 * Revision 1.8 1996/05/27 18:56:37 jimz 91 * more code cleanup 92 * better typing 93 * compiles in all 3 environments 94 * 95 * Revision 1.7 1996/05/24 22:17:04 jimz 96 * continue code + namespace cleanup 97 * typed a bunch of flags 98 * 99 * Revision 1.6 1996/05/24 04:28:55 jimz 100 * release cleanup ckpt 101 * 102 * Revision 1.5 1996/05/23 21:46:35 jimz 103 * checkpoint in code cleanup (release prep) 104 * lots of types, function names have been fixed 105 * 106 * Revision 1.4 1996/05/23 00:33:23 jimz 107 * code cleanup: move all debug decls to rf_options.c, all extern 108 * debug decls to rf_options.h, all debug vars preceded by rf_ 109 * 110 * Revision 1.3 1996/05/18 19:51:34 jimz 111 * major code cleanup- fix syntax, make some types consistent, 112 * add prototypes, clean out dead code, et cetera 113 * 114 * Revision 1.2 1996/05/08 21:01:24 jimz 115 * fixed up enum type names that were conflicting with other 116 * enums and function names (ie, "panic") 117 * future naming trends will be towards RF_ and rf_ for 118 * everything raidframe-related 119 * 120 * Revision 1.1 1996/05/03 19:22:23 wvcii 121 * Initial revision 122 * 123 */ 124 125#include "rf_types.h" 126#include "rf_raid.h" 127#include "rf_dag.h" 128#include "rf_dagutils.h" 129#include "rf_dagfuncs.h" 130#include "rf_threadid.h" 131#include "rf_debugMem.h" 132#include "rf_memchunk.h" 133#include "rf_general.h" 134#include "rf_dagdegrd.h" 135#include "rf_sys.h" 136 137 
138/****************************************************************************** 139 * 140 * General comments on DAG creation: 141 * 142 * All DAGs in this file use roll-away error recovery. Each DAG has a single 143 * commit node, usually called "Cmt." If an error occurs before the Cmt node 144 * is reached, the execution engine will halt forward execution and work 145 * backward through the graph, executing the undo functions. Assuming that 146 * each node in the graph prior to the Cmt node are undoable and atomic - or - 147 * does not make changes to permanent state, the graph will fail atomically. 148 * If an error occurs after the Cmt node executes, the engine will roll-forward 149 * through the graph, blindly executing nodes until it reaches the end. 150 * If a graph reaches the end, it is assumed to have completed successfully. 151 * 152 * A graph has only 1 Cmt node. 153 * 154 */ 155 156 157/****************************************************************************** 158 * 159 * The following wrappers map the standard DAG creation interface to the 160 * DAG creation routines. Additionally, these wrappers enable experimentation 161 * with new DAG structures by providing an extra level of indirection, allowing 162 * the DAG creation routines to be replaced at this single point. 
163 */ 164 165void rf_CreateRaidFiveDegradedReadDAG( 166 RF_Raid_t *raidPtr, 167 RF_AccessStripeMap_t *asmap, 168 RF_DagHeader_t *dag_h, 169 void *bp, 170 RF_RaidAccessFlags_t flags, 171 RF_AllocListElem_t *allocList) 172{ 173 rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 174 &rf_xorRecoveryFuncs); 175} 176 177 178/****************************************************************************** 179 * 180 * DAG creation code begins here 181 */ 182 183 184/****************************************************************************** 185 * Create a degraded read DAG for RAID level 1 186 * 187 * Hdr -> Nil -> R(p/s)d -> Commit -> Trm 188 * 189 * The "Rd" node reads data from the surviving disk in the mirror pair 190 * Rpd - read of primary copy 191 * Rsd - read of secondary copy 192 * 193 * Parameters: raidPtr - description of the physical array 194 * asmap - logical & physical addresses for this access 195 * bp - buffer ptr (for holding write data) 196 * flags - general flags (e.g. 
disk locking) 197 * allocList - list of memory allocated in DAG creation 198 *****************************************************************************/ 199 200void rf_CreateRaidOneDegradedReadDAG( 201 RF_Raid_t *raidPtr, 202 RF_AccessStripeMap_t *asmap, 203 RF_DagHeader_t *dag_h, 204 void *bp, 205 RF_RaidAccessFlags_t flags, 206 RF_AllocListElem_t *allocList) 207{ 208 RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode; 209 RF_StripeNum_t parityStripeID; 210 RF_ReconUnitNum_t which_ru; 211 RF_PhysDiskAddr_t *pda; 212 int useMirror, i; 213 214 useMirror = 0; 215 parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), 216 asmap->raidAddress, &which_ru); 217 if (rf_dagDebug) { 218 printf("[Creating RAID level 1 degraded read DAG]\n"); 219 } 220 dag_h->creator = "RaidOneDegradedReadDAG"; 221 /* alloc the Wnd nodes and the Wmir node */ 222 if (asmap->numDataFailed == 0) 223 useMirror = RF_FALSE; 224 else 225 useMirror = RF_TRUE; 226 227 /* total number of nodes = 1 + (block + commit + terminator) */ 228 RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); 229 i = 0; 230 rdNode = &nodes[i]; i++; 231 blockNode = &nodes[i]; i++; 232 commitNode = &nodes[i]; i++; 233 termNode = &nodes[i]; i++; 234 235 /* this dag can not commit until the commit node is reached. 
errors prior 236 * to the commit point imply the dag has failed and must be retried 237 */ 238 dag_h->numCommitNodes = 1; 239 dag_h->numCommits = 0; 240 dag_h->numSuccedents = 1; 241 242 /* initialize the block, commit, and terminator nodes */ 243 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 244 NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); 245 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 246 NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); 247 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, 248 NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); 249 250 pda = asmap->physInfo; 251 RF_ASSERT(pda != NULL); 252 /* parityInfo must describe entire parity unit */ 253 RF_ASSERT(asmap->parityInfo->next == NULL); 254 255 /* initialize the data node */ 256 if (!useMirror) { 257 /* read primary copy of data */ 258 rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 259 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList); 260 rdNode->params[0].p = pda; 261 rdNode->params[1].p = pda->bufPtr; 262 rdNode->params[2].v = parityStripeID; 263 rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 264 } 265 else { 266 /* read secondary copy of data */ 267 rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 268 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList); 269 rdNode->params[0].p = asmap->parityInfo; 270 rdNode->params[1].p = pda->bufPtr; 271 rdNode->params[2].v = parityStripeID; 272 rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 273 } 274 275 /* connect header to block node */ 276 RF_ASSERT(dag_h->numSuccedents == 1); 277 RF_ASSERT(blockNode->numAntecedents == 0); 278 dag_h->succedents[0] = blockNode; 279 280 /* connect block node to rdnode */ 281 RF_ASSERT(blockNode->numSuccedents == 1); 282 RF_ASSERT(rdNode->numAntecedents == 1); 283 blockNode->succedents[0] = rdNode; 284 
rdNode->antecedents[0] = blockNode; 285 rdNode->antType[0] = rf_control; 286 287 /* connect rdnode to commit node */ 288 RF_ASSERT(rdNode->numSuccedents == 1); 289 RF_ASSERT(commitNode->numAntecedents == 1); 290 rdNode->succedents[0] = commitNode; 291 commitNode->antecedents[0] = rdNode; 292 commitNode->antType[0] = rf_control; 293 294 /* connect commit node to terminator */ 295 RF_ASSERT(commitNode->numSuccedents == 1); 296 RF_ASSERT(termNode->numAntecedents == 1); 297 RF_ASSERT(termNode->numSuccedents == 0); 298 commitNode->succedents[0] = termNode; 299 termNode->antecedents[0] = commitNode; 300 termNode->antType[0] = rf_control; 301} 302 303 304 305/****************************************************************************** 306 * 307 * creates a DAG to perform a degraded-mode read of data within one stripe. 308 * This DAG is as follows: 309 * 310 * Hdr -> Block -> Rud -> Xor -> Cmt -> T 311 * -> Rrd -> 312 * -> Rp --> 313 * 314 * Each R node is a successor of the L node 315 * One successor arc from each R node goes to C, and the other to X 316 * There is one Rud for each chunk of surviving user data requested by the 317 * user, and one Rrd for each chunk of surviving user data _not_ being read by 318 * the user 319 * R = read, ud = user data, rd = recovery (surviving) data, p = parity 320 * X = XOR, C = Commit, T = terminate 321 * 322 * The block node guarantees a single source node. 323 * 324 * Note: The target buffer for the XOR node is set to the actual user buffer 325 * where the failed data is supposed to end up. This buffer is zero'd by the 326 * code here. Thus, if you create a degraded read dag, use it, and then 327 * re-use, you have to be sure to zero the target buffer prior to the re-use. 328 * 329 * The recfunc argument at the end specifies the name and function used for 330 * the redundancy 331 * recovery function. 
332 * 333 *****************************************************************************/ 334 335void rf_CreateDegradedReadDAG( 336 RF_Raid_t *raidPtr, 337 RF_AccessStripeMap_t *asmap, 338 RF_DagHeader_t *dag_h, 339 void *bp, 340 RF_RaidAccessFlags_t flags, 341 RF_AllocListElem_t *allocList, 342 RF_RedFuncs_t *recFunc) 343{ 344 RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode; 345 RF_DagNode_t *commitNode, *rpNode, *termNode; 346 int nNodes, nRrdNodes, nRudNodes, nXorBufs, i; 347 int j, paramNum; 348 RF_SectorCount_t sectorsPerSU; 349 RF_ReconUnitNum_t which_ru; 350 char *overlappingPDAs; /* a temporary array of flags */ 351 RF_AccessStripeMapHeader_t *new_asm_h[2]; 352 RF_PhysDiskAddr_t *pda, *parityPDA; 353 RF_StripeNum_t parityStripeID; 354 RF_PhysDiskAddr_t *failedPDA; 355 RF_RaidLayout_t *layoutPtr; 356 char *rpBuf; 357 358 layoutPtr = &(raidPtr->Layout); 359 /* failedPDA points to the pda within the asm that targets the failed disk */ 360 failedPDA = asmap->failedPDAs[0]; 361 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, 362 asmap->raidAddress, &which_ru); 363 sectorsPerSU = layoutPtr->sectorsPerStripeUnit; 364 365 if (rf_dagDebug) { 366 printf("[Creating degraded read DAG]\n"); 367 } 368 369 RF_ASSERT( asmap->numDataFailed == 1 ); 370 dag_h->creator = "DegradedReadDAG"; 371 372 /* 373 * generate two ASMs identifying the surviving data we need 374 * in order to recover the lost data 375 */ 376 377 /* overlappingPDAs array must be zero'd */ 378 RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *)); 379 rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs, 380 &rpBuf, overlappingPDAs, allocList); 381 382 /* 383 * create all the nodes at once 384 * 385 * -1 because no access is generated for the failed pda 386 */ 387 nRudNodes = asmap->numStripeUnitsAccessed-1; 388 nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) + 389 ((new_asm_h[1]) ? 
new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0); 390 nNodes = 5 + nRudNodes + nRrdNodes; /* lock, unlock, xor, Rp, Rud, Rrd */ 391 RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), 392 allocList); 393 i = 0; 394 blockNode = &nodes[i]; i++; 395 commitNode = &nodes[i]; i++; 396 xorNode = &nodes[i]; i++; 397 rpNode = &nodes[i]; i++; 398 termNode = &nodes[i]; i++; 399 rudNodes = &nodes[i]; i += nRudNodes; 400 rrdNodes = &nodes[i]; i += nRrdNodes; 401 RF_ASSERT(i == nNodes); 402 403 /* initialize nodes */ 404 dag_h->numCommitNodes = 1; 405 dag_h->numCommits = 0; 406 /* this dag can not commit until the commit node is reached 407 * errors prior to the commit point imply the dag has failed 408 */ 409 dag_h->numSuccedents = 1; 410 411 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 412 NULL, nRudNodes+nRrdNodes+1, 0, 0, 0, dag_h, "Nil", allocList); 413 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 414 NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); 415 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, 416 NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); 417 rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc, 418 NULL, 1, nRudNodes+nRrdNodes+1, 2*nXorBufs+2, 1, dag_h, 419 recFunc->SimpleName, allocList); 420 421 /* fill in the Rud nodes */ 422 for (pda=asmap->physInfo, i=0; i<nRudNodes; i++, pda=pda->next) { 423 if (pda == failedPDA) {i--; continue;} 424 rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, 425 rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, 426 "Rud", allocList); 427 RF_ASSERT(pda); 428 rudNodes[i].params[0].p = pda; 429 rudNodes[i].params[1].p = pda->bufPtr; 430 rudNodes[i].params[2].v = parityStripeID; 431 rudNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 432 } 433 434 /* fill in the Rrd nodes */ 435 i = 0; 436 if (new_asm_h[0]) { 437 for 
(pda=new_asm_h[0]->stripeMap->physInfo; 438 i<new_asm_h[0]->stripeMap->numStripeUnitsAccessed; 439 i++, pda=pda->next) 440 { 441 rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, 442 rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, 443 dag_h, "Rrd", allocList); 444 RF_ASSERT(pda); 445 rrdNodes[i].params[0].p = pda; 446 rrdNodes[i].params[1].p = pda->bufPtr; 447 rrdNodes[i].params[2].v = parityStripeID; 448 rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 449 } 450 } 451 if (new_asm_h[1]) { 452 for (j=0,pda=new_asm_h[1]->stripeMap->physInfo; 453 j<new_asm_h[1]->stripeMap->numStripeUnitsAccessed; 454 j++, pda=pda->next) 455 { 456 rf_InitNode(&rrdNodes[i+j], rf_wait, RF_FALSE, rf_DiskReadFunc, 457 rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, 458 dag_h, "Rrd", allocList); 459 RF_ASSERT(pda); 460 rrdNodes[i+j].params[0].p = pda; 461 rrdNodes[i+j].params[1].p = pda->bufPtr; 462 rrdNodes[i+j].params[2].v = parityStripeID; 463 rrdNodes[i+j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 464 } 465 } 466 467 /* make a PDA for the parity unit */ 468 RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); 469 parityPDA->row = asmap->parityInfo->row; 470 parityPDA->col = asmap->parityInfo->col; 471 parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU) 472 * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); 473 parityPDA->numSector = failedPDA->numSector; 474 475 /* initialize the Rp node */ 476 rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 477 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList); 478 rpNode->params[0].p = parityPDA; 479 rpNode->params[1].p = rpBuf; 480 rpNode->params[2].v = parityStripeID; 481 rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 482 483 /* 484 * the last and nastiest step is to assign all 485 * the parameters of the Xor node 486 */ 487 
paramNum=0; 488 for (i=0; i<nRrdNodes; i++) { 489 /* all the Rrd nodes need to be xored together */ 490 xorNode->params[paramNum++] = rrdNodes[i].params[0]; 491 xorNode->params[paramNum++] = rrdNodes[i].params[1]; 492 } 493 for (i=0; i<nRudNodes; i++) { 494 /* any Rud nodes that overlap the failed access need to be xored in */ 495 if (overlappingPDAs[i]) { 496 RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); 497 bcopy((char *)rudNodes[i].params[0].p, (char *)pda, sizeof(RF_PhysDiskAddr_t)); 498 rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0); 499 xorNode->params[paramNum++].p = pda; 500 xorNode->params[paramNum++].p = pda->bufPtr; 501 } 502 } 503 RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char)); 504 505 /* install parity pda as last set of params to be xor'd */ 506 xorNode->params[paramNum++].p = parityPDA; 507 xorNode->params[paramNum++].p = rpBuf; 508 509 /* 510 * the last 2 params to the recovery xor node are 511 * the failed PDA and the raidPtr 512 */ 513 xorNode->params[paramNum++].p = failedPDA; 514 xorNode->params[paramNum++].p = raidPtr; 515 RF_ASSERT( paramNum == 2*nXorBufs+2 ); 516 517 /* 518 * The xor node uses results[0] as the target buffer. 519 * Set pointer and zero the buffer. In the kernel, this 520 * may be a user buffer in which case we have to remap it. 
521 */ 522 xorNode->results[0] = failedPDA->bufPtr; 523 RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr, 524 failedPDA->numSector)); 525 526 /* connect nodes to form graph */ 527 /* connect the header to the block node */ 528 RF_ASSERT(dag_h->numSuccedents == 1); 529 RF_ASSERT(blockNode->numAntecedents == 0); 530 dag_h->succedents[0] = blockNode; 531 532 /* connect the block node to the read nodes */ 533 RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes)); 534 RF_ASSERT(rpNode->numAntecedents == 1); 535 blockNode->succedents[0] = rpNode; 536 rpNode->antecedents[0] = blockNode; 537 rpNode->antType[0] = rf_control; 538 for (i = 0; i < nRrdNodes; i++) { 539 RF_ASSERT(rrdNodes[i].numSuccedents == 1); 540 blockNode->succedents[1 + i] = &rrdNodes[i]; 541 rrdNodes[i].antecedents[0] = blockNode; 542 rrdNodes[i].antType[0] = rf_control; 543 } 544 for (i = 0; i < nRudNodes; i++) { 545 RF_ASSERT(rudNodes[i].numSuccedents == 1); 546 blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i]; 547 rudNodes[i].antecedents[0] = blockNode; 548 rudNodes[i].antType[0] = rf_control; 549 } 550 551 /* connect the read nodes to the xor node */ 552 RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes)); 553 RF_ASSERT(rpNode->numSuccedents == 1); 554 rpNode->succedents[0] = xorNode; 555 xorNode->antecedents[0] = rpNode; 556 xorNode->antType[0] = rf_trueData; 557 for (i = 0; i < nRrdNodes; i++) { 558 RF_ASSERT(rrdNodes[i].numSuccedents == 1); 559 rrdNodes[i].succedents[0] = xorNode; 560 xorNode->antecedents[1 + i] = &rrdNodes[i]; 561 xorNode->antType[1 + i] = rf_trueData; 562 } 563 for (i = 0; i < nRudNodes; i++) { 564 RF_ASSERT(rudNodes[i].numSuccedents == 1); 565 rudNodes[i].succedents[0] = xorNode; 566 xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i]; 567 xorNode->antType[1 + nRrdNodes + i] = rf_trueData; 568 } 569 570 /* connect the xor node to the commit node */ 571 RF_ASSERT(xorNode->numSuccedents == 1); 572 
RF_ASSERT(commitNode->numAntecedents == 1); 573 xorNode->succedents[0] = commitNode; 574 commitNode->antecedents[0] = xorNode; 575 commitNode->antType[0] = rf_control; 576 577 /* connect the termNode to the commit node */ 578 RF_ASSERT(commitNode->numSuccedents == 1); 579 RF_ASSERT(termNode->numAntecedents == 1); 580 RF_ASSERT(termNode->numSuccedents == 0); 581 commitNode->succedents[0] = termNode; 582 termNode->antType[0] = rf_control; 583 termNode->antecedents[0] = commitNode; 584} 585 586 587/****************************************************************************** 588 * Create a degraded read DAG for Chained Declustering 589 * 590 * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm 591 * 592 * The "Rd" node reads data from the surviving disk in the mirror pair 593 * Rpd - read of primary copy 594 * Rsd - read of secondary copy 595 * 596 * Parameters: raidPtr - description of the physical array 597 * asmap - logical & physical addresses for this access 598 * bp - buffer ptr (for holding write data) 599 * flags - general flags (e.g. 
disk locking) 600 * allocList - list of memory allocated in DAG creation 601 *****************************************************************************/ 602 603void rf_CreateRaidCDegradedReadDAG( 604 RF_Raid_t *raidPtr, 605 RF_AccessStripeMap_t *asmap, 606 RF_DagHeader_t *dag_h, 607 void *bp, 608 RF_RaidAccessFlags_t flags, 609 RF_AllocListElem_t *allocList) 610{ 611 RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode; 612 RF_StripeNum_t parityStripeID; 613 int useMirror, i, shiftable; 614 RF_ReconUnitNum_t which_ru; 615 RF_PhysDiskAddr_t *pda; 616 617 if ((asmap->numDataFailed + asmap->numParityFailed) == 0) { 618 shiftable = RF_TRUE; 619 } 620 else { 621 shiftable = RF_FALSE; 622 } 623 useMirror = 0; 624 parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), 625 asmap->raidAddress, &which_ru); 626 627 if (rf_dagDebug) { 628 printf("[Creating RAID C degraded read DAG]\n"); 629 } 630 dag_h->creator = "RaidCDegradedReadDAG"; 631 /* alloc the Wnd nodes and the Wmir node */ 632 if (asmap->numDataFailed == 0) 633 useMirror = RF_FALSE; 634 else 635 useMirror = RF_TRUE; 636 637 /* total number of nodes = 1 + (block + commit + terminator) */ 638 RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); 639 i = 0; 640 rdNode = &nodes[i]; i++; 641 blockNode = &nodes[i]; i++; 642 commitNode = &nodes[i]; i++; 643 termNode = &nodes[i]; i++; 644 645 /* 646 * This dag can not commit until the commit node is reached. 647 * Errors prior to the commit point imply the dag has failed 648 * and must be retried. 
649 */ 650 dag_h->numCommitNodes = 1; 651 dag_h->numCommits = 0; 652 dag_h->numSuccedents = 1; 653 654 /* initialize the block, commit, and terminator nodes */ 655 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 656 NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); 657 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 658 NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); 659 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, 660 NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); 661 662 pda = asmap->physInfo; 663 RF_ASSERT(pda != NULL); 664 /* parityInfo must describe entire parity unit */ 665 RF_ASSERT(asmap->parityInfo->next == NULL); 666 667 /* initialize the data node */ 668 if (!useMirror) { 669 rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 670 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList); 671 if (shiftable && rf_compute_workload_shift(raidPtr, pda)) { 672 /* shift this read to the next disk in line */ 673 rdNode->params[0].p = asmap->parityInfo; 674 rdNode->params[1].p = pda->bufPtr; 675 rdNode->params[2].v = parityStripeID; 676 rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 677 } 678 else { 679 /* read primary copy */ 680 rdNode->params[0].p = pda; 681 rdNode->params[1].p = pda->bufPtr; 682 rdNode->params[2].v = parityStripeID; 683 rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 684 } 685 } 686 else { 687 /* read secondary copy of data */ 688 rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 689 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList); 690 rdNode->params[0].p = asmap->parityInfo; 691 rdNode->params[1].p = pda->bufPtr; 692 rdNode->params[2].v = parityStripeID; 693 rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); 694 } 695 696 /* connect header to block node */ 697 RF_ASSERT(dag_h->numSuccedents == 1); 698 
RF_ASSERT(blockNode->numAntecedents == 0); 699 dag_h->succedents[0] = blockNode; 700 701 /* connect block node to rdnode */ 702 RF_ASSERT(blockNode->numSuccedents == 1); 703 RF_ASSERT(rdNode->numAntecedents == 1); 704 blockNode->succedents[0] = rdNode; 705 rdNode->antecedents[0] = blockNode; 706 rdNode->antType[0] = rf_control; 707 708 /* connect rdnode to commit node */ 709 RF_ASSERT(rdNode->numSuccedents == 1); 710 RF_ASSERT(commitNode->numAntecedents == 1); 711 rdNode->succedents[0] = commitNode; 712 commitNode->antecedents[0] = rdNode; 713 commitNode->antType[0] = rf_control; 714 715 /* connect commit node to terminator */ 716 RF_ASSERT(commitNode->numSuccedents == 1); 717 RF_ASSERT(termNode->numAntecedents == 1); 718 RF_ASSERT(termNode->numSuccedents == 0); 719 commitNode->succedents[0] = termNode; 720 termNode->antecedents[0] = commitNode; 721 termNode->antType[0] = rf_control; 722} 723 724/* 725 * XXX move this elsewhere? 726 */ 727void rf_DD_GenerateFailedAccessASMs( 728 RF_Raid_t *raidPtr, 729 RF_AccessStripeMap_t *asmap, 730 RF_PhysDiskAddr_t **pdap, 731 int *nNodep, 732 RF_PhysDiskAddr_t **pqpdap, 733 int *nPQNodep, 734 RF_AllocListElem_t *allocList) 735{ 736 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 737 int PDAPerDisk,i; 738 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 739 int numDataCol = layoutPtr->numDataCol; 740 int state; 741 RF_SectorNum_t suoff, suend; 742 unsigned firstDataCol, napdas, count; 743 RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0; 744 RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1]; 745 RF_PhysDiskAddr_t *pda_p; 746 RF_PhysDiskAddr_t *phys_p; 747 RF_RaidAddr_t sosAddr; 748 749 /* determine how many pda's we will have to generate per unaccess stripe. 750 If there is only one failed data unit, it is one; if two, possibly two, 751 depending wether they overlap. 
*/ 752 753 fone_start = rf_StripeUnitOffset(layoutPtr,fone->startSector); 754 fone_end = fone_start + fone->numSector; 755 756#define CONS_PDA(if,start,num) \ 757 pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \ 758 pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \ 759 pda_p->numSector = num; \ 760 pda_p->next = NULL; \ 761 RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList) 762 763 if (asmap->numDataFailed==1) 764 { 765 PDAPerDisk = 1; 766 state = 1; 767 RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); 768 pda_p = *pqpdap; 769 /* build p */ 770 CONS_PDA(parityInfo,fone_start,fone->numSector); 771 pda_p->type = RF_PDA_TYPE_PARITY; 772 pda_p++; 773 /* build q */ 774 CONS_PDA(qInfo,fone_start,fone->numSector); 775 pda_p->type = RF_PDA_TYPE_Q; 776 } 777 else 778 { 779 ftwo_start = rf_StripeUnitOffset(layoutPtr,ftwo->startSector); 780 ftwo_end = ftwo_start + ftwo->numSector; 781 if (fone->numSector + ftwo->numSector > secPerSU) 782 { 783 PDAPerDisk = 1; 784 state = 2; 785 RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); 786 pda_p = *pqpdap; 787 CONS_PDA(parityInfo,0,secPerSU); 788 pda_p->type = RF_PDA_TYPE_PARITY; 789 pda_p++; 790 CONS_PDA(qInfo,0,secPerSU); 791 pda_p->type = RF_PDA_TYPE_Q; 792 } 793 else 794 { 795 PDAPerDisk = 2; 796 state = 3; 797 /* four of them, fone, then ftwo */ 798 RF_MallocAndAdd(*pqpdap,4*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); 799 pda_p = *pqpdap; 800 CONS_PDA(parityInfo,fone_start,fone->numSector); 801 pda_p->type = RF_PDA_TYPE_PARITY; 802 pda_p++; 803 CONS_PDA(qInfo,fone_start,fone->numSector); 804 pda_p->type = RF_PDA_TYPE_Q; 805 pda_p++; 806 CONS_PDA(parityInfo,ftwo_start,ftwo->numSector); 807 pda_p->type = RF_PDA_TYPE_PARITY; 808 pda_p++; 809 CONS_PDA(qInfo,ftwo_start,ftwo->numSector); 810 pda_p->type = RF_PDA_TYPE_Q; 811 } 812 } 813 /* figure out number of nonaccessed 
pda */ 814 napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo==NULL ? 1 : 0)); 815 *nPQNodep = PDAPerDisk; 816 817 /* sweep over the over accessed pda's, figuring out the number of 818 additional pda's to generate. Of course, skip the failed ones */ 819 820 count = 0; 821 for ( pda_p=asmap->physInfo; pda_p; pda_p= pda_p->next) 822 { 823 if ((pda_p == fone) || (pda_p == ftwo)) 824 continue; 825 suoff = rf_StripeUnitOffset(layoutPtr,pda_p->startSector); 826 suend = suoff + pda_p->numSector; 827 switch (state) 828 { 829 case 1: /* one failed PDA to overlap */ 830 /* if a PDA doesn't contain the failed unit, it can 831 only miss the start or end, not both */ 832 if ((suoff > fone_start) || (suend <fone_end)) 833 count++; 834 break; 835 case 2: /* whole stripe */ 836 if (suoff) /* leak at begining */ 837 count++; 838 if (suend < numDataCol) /* leak at end */ 839 count++; 840 break; 841 case 3: /* two disjoint units */ 842 if ((suoff > fone_start) || (suend <fone_end)) 843 count++; 844 if ((suoff > ftwo_start) || (suend <ftwo_end)) 845 count++; 846 break; 847 default: 848 RF_PANIC(); 849 } 850 } 851 852 napdas += count; 853 *nNodep = napdas; 854 if (napdas == 0) return; /* short circuit */ 855 856 /* allocate up our list of pda's */ 857 858 RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); 859 *pdap = pda_p; 860 861 /* linkem together */ 862 for (i=0; i < (napdas-1); i++) 863 pda_p[i].next = pda_p+(i+1); 864 865 /* march through the one's up to the first accessed disk */ 866 firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),asmap->physInfo->raidAddress) % numDataCol; 867 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 868 for (i=0; i < firstDataCol; i++) 869 { 870 if ((pda_p - (*pdap)) == napdas) 871 continue; 872 pda_p->type = RF_PDA_TYPE_DATA; 873 pda_p->raidAddress = sosAddr + (i * secPerSU); 874 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, 
&(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 875 /* skip over dead disks */ 876 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status)) 877 continue; 878 switch (state) 879 { 880 case 1: /* fone */ 881 pda_p->numSector = fone->numSector; 882 pda_p->raidAddress += fone_start; 883 pda_p->startSector += fone_start; 884 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 885 break; 886 case 2: /* full stripe */ 887 pda_p->numSector = secPerSU; 888 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList); 889 break; 890 case 3: /* two slabs */ 891 pda_p->numSector = fone->numSector; 892 pda_p->raidAddress += fone_start; 893 pda_p->startSector += fone_start; 894 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 895 pda_p++; 896 pda_p->type = RF_PDA_TYPE_DATA; 897 pda_p->raidAddress = sosAddr + (i * secPerSU); 898 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 899 pda_p->numSector = ftwo->numSector; 900 pda_p->raidAddress += ftwo_start; 901 pda_p->startSector += ftwo_start; 902 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 903 break; 904 default: 905 RF_PANIC(); 906 } 907 pda_p++; 908 } 909 910 /* march through the touched stripe units */ 911 for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) 912 { 913 if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1])) 914 continue; 915 suoff = rf_StripeUnitOffset(layoutPtr,phys_p->startSector); 916 suend = suoff + phys_p->numSector; 917 switch(state) 918 { 919 case 1: /* single buffer */ 920 if (suoff > fone_start) 921 { 922 RF_ASSERT( suend >= fone_end ); 923 /* The data read starts after the mapped access, 924 snip off the begining */ 925 pda_p->numSector = suoff - fone_start; 926 pda_p->raidAddress = sosAddr + 
(i*secPerSU) + fone_start; 927 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 928 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 929 pda_p++; 930 } 931 if (suend < fone_end) 932 { 933 RF_ASSERT ( suoff <= fone_start); 934 /* The data read stops before the end of the failed access, extend */ 935 pda_p->numSector = fone_end - suend; 936 pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */ 937 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 938 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 939 pda_p++; 940 } 941 break; 942 case 2: /* whole stripe unit */ 943 RF_ASSERT( (suoff == 0) || (suend == secPerSU)); 944 if (suend < secPerSU) 945 { /* short read, snip from end on */ 946 pda_p->numSector = secPerSU - suend; 947 pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? 
*/ 948 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 949 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 950 pda_p++; 951 } 952 else 953 if (suoff > 0) 954 { /* short at front */ 955 pda_p->numSector = suoff; 956 pda_p->raidAddress = sosAddr + (i*secPerSU); 957 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 958 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 959 pda_p++; 960 } 961 break; 962 case 3: /* two nonoverlapping failures */ 963 if ((suoff > fone_start) || (suend <fone_end)) 964 { 965 if (suoff > fone_start) 966 { 967 RF_ASSERT( suend >= fone_end ); 968 /* The data read starts after the mapped access, 969 snip off the begining */ 970 pda_p->numSector = suoff - fone_start; 971 pda_p->raidAddress = sosAddr + (i*secPerSU) + fone_start; 972 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 973 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 974 pda_p++; 975 } 976 if (suend < fone_end) 977 { 978 RF_ASSERT ( suoff <= fone_start); 979 /* The data read stops before the end of the failed access, extend */ 980 pda_p->numSector = fone_end - suend; 981 pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? 
*/ 982 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 983 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 984 pda_p++; 985 } 986 } 987 if ((suoff > ftwo_start) || (suend <ftwo_end)) 988 { 989 if (suoff > ftwo_start) 990 { 991 RF_ASSERT( suend >= ftwo_end ); 992 /* The data read starts after the mapped access, 993 snip off the begining */ 994 pda_p->numSector = suoff - ftwo_start; 995 pda_p->raidAddress = sosAddr + (i*secPerSU) + ftwo_start; 996 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 997 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 998 pda_p++; 999 } 1000 if (suend < ftwo_end) 1001 { 1002 RF_ASSERT ( suoff <= ftwo_start); 1003 /* The data read stops before the end of the failed access, extend */ 1004 pda_p->numSector = ftwo_end - suend; 1005 pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? 
*/ 1006 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 1007 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 1008 pda_p++; 1009 } 1010 } 1011 break; 1012 default: 1013 RF_PANIC(); 1014 } 1015 } 1016 1017 /* after the last accessed disk */ 1018 for (; i < numDataCol; i++ ) 1019 { 1020 if ((pda_p - (*pdap)) == napdas) 1021 continue; 1022 pda_p->type = RF_PDA_TYPE_DATA; 1023 pda_p->raidAddress = sosAddr + (i * secPerSU); 1024 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 1025 /* skip over dead disks */ 1026 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status)) 1027 continue; 1028 switch (state) 1029 { 1030 case 1: /* fone */ 1031 pda_p->numSector = fone->numSector; 1032 pda_p->raidAddress += fone_start; 1033 pda_p->startSector += fone_start; 1034 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 1035 break; 1036 case 2: /* full stripe */ 1037 pda_p->numSector = secPerSU; 1038 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList); 1039 break; 1040 case 3: /* two slabs */ 1041 pda_p->numSector = fone->numSector; 1042 pda_p->raidAddress += fone_start; 1043 pda_p->startSector += fone_start; 1044 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 1045 pda_p++; 1046 pda_p->type = RF_PDA_TYPE_DATA; 1047 pda_p->raidAddress = sosAddr + (i * secPerSU); 1048 (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); 1049 pda_p->numSector = ftwo->numSector; 1050 pda_p->raidAddress += ftwo_start; 1051 pda_p->startSector += ftwo_start; 1052 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); 1053 break; 1054 default: 1055 RF_PANIC(); 
      }
      pda_p++;
  }

  /* sanity: the walk above must have consumed exactly the napdas
     entries allocated for *pdap */
  RF_ASSERT (pda_p - *pdap == napdas);
  return;
}

/*
 * INIT_DISK_NODE: initialize one disk-read node of the double-degraded
 * read DAG and wire its fixed edges: two successors (the unblock node
 * and the recovery node) and one control antecedent (the block node).
 * NOTE: relies on dag_h, allocList, blockNode, unblockNode and
 * recoveryNode being in scope at the expansion site.
 */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

/*
 * DISK_NODE_PARAMS: load the standard four parameters of a disk-read
 * node: the physical disk address, its data buffer, the parity stripe
 * ID, and the priority/reconstruction-unit word.
 * NOTE: relies on parityStripeID and which_ru at the expansion site.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)

/*
 * rf_DoubleDegRead: build a DAG to service a read of a stripe that has
 * two failed data units (double-degraded mode; used by doubly-redundant
 * layouts that keep both a P and a Q unit per stripe).
 *
 * DAG shape constructed here:
 *
 *   Hdr -> Block -> reads (Rud / Rrd / Rp / Rq) -> Unblock -> Trm
 *                       \--------> Recovery --------------> Trm
 *
 * Every read node feeds both the unblock node (control) and the
 * recovery node (true data); the recovery node regenerates the failed
 * units from the surviving data plus P and Q.
 *
 * Parameters:
 *   raidPtr               - the array state
 *   asmap                 - access stripe map; failedPDAs[0]/[1] name the
 *                           one or two failed units
 *   dag_h                 - DAG header to fill in (creator, successors,
 *                           commit count)
 *   bp, flags             - carried for interface symmetry with the other
 *                           DAG-creation routines; not referenced in this
 *                           function's body
 *   allocList             - allocation list; all memory obtained here is
 *                           chained to it for later release
 *   redundantReadNodeName - label used for the Q-unit read nodes
 *   recoveryNodeName      - label used for the recovery node
 *   recovFunc             - do-function installed in the recovery node
 */
void rf_DoubleDegRead(
  RF_Raid_t             *raidPtr,
  RF_AccessStripeMap_t  *asmap,
  RF_DagHeader_t        *dag_h,
  void                  *bp,
  RF_RaidAccessFlags_t   flags,
  RF_AllocListElem_t    *allocList,
  char                  *redundantReadNodeName,
  char                  *recoveryNodeName,
  int                  (*recovFunc)(RF_DagNode_t *))
{
  RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
  RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode, *unblockNode, *rpNodes, *rqNodes, *termNode;
  RF_PhysDiskAddr_t *pda, *pqPDAs;
  RF_PhysDiskAddr_t *npdas;
  int nNodes, nRrdNodes, nRudNodes, i;
  RF_ReconUnitNum_t which_ru;
  int nReadNodes, nPQNodes;
  RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
  RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
  RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);

  if (rf_dagDebug) printf("[Creating Double Degraded Read DAG]\n");
  /* compute the pda's for the non-accessed (Rrd) reads and for the
     P/Q (redundancy) reads */
  rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes,allocList);

  /* Rud = user reads of surviving, accessed units; the failed units
     are excluded from the count */
  nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
  /* 2*nPQNodes: one P read and one Q read per redundancy slice */
  nReadNodes = nRrdNodes + nRudNodes + 2*nPQNodes;
  nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes;

  /* carve the node array; NOTE the read nodes (rud, rrd, rp, rq) are
     deliberately contiguous - the wiring loops below index across all
     of them starting from rudNodes */
  RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
  i = 0;
  blockNode    = &nodes[i]; i += 1;
  unblockNode  = &nodes[i]; i += 1;
  recoveryNode = &nodes[i]; i += 1;
  termNode     = &nodes[i]; i += 1;
  rudNodes     = &nodes[i]; i += nRudNodes;
  rrdNodes     = &nodes[i]; i += nRrdNodes;
  rpNodes      = &nodes[i]; i += nPQNodes;
  rqNodes      = &nodes[i]; i += nPQNodes;
  RF_ASSERT(i == nNodes);

  /* initialize the DAG header */
  dag_h->numSuccedents = 1;
  dag_h->succedents[0] = blockNode;
  dag_h->creator = "DoubleDegRead";
  dag_h->numCommits = 0;
  dag_h->numCommitNodes = 1; /* unblock */

  /* term node waits on both the unblock node and the recovery node */
  rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
  termNode->antecedents[0] = unblockNode;
  termNode->antType[0] = rf_control;
  termNode->antecedents[1] = recoveryNode;
  termNode->antType[1] = rf_control;

  /* init the block and unblock nodes */
  /* The block node has all read nodes as successors; similarly all
     read nodes are predecessors of the unblock node. */
  rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
  rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList);

  /* rudNodes+i walks across ALL nReadNodes read nodes (rud, rrd, rp,
     rq) because they are contiguous in the nodes array */
  for (i=0; i < nReadNodes; i++)
  {
    blockNode->succedents[i] = rudNodes+i;
    unblockNode->antecedents[i] = rudNodes+i;
    unblockNode->antType[i] = rf_control;
  }
  unblockNode->succedents[0] = termNode;

  /* The recovery node has all the reads as predecessors, and the term
     node as successor. It gets a pda as a param from each of the read
     nodes plus the raidPtr and asmap. For each failed unit it has a
     result pda. */
  rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
	      1,            /* successors */
	      nReadNodes,   /* preds */
	      nReadNodes+2, /* params */
	      asmap->numDataFailed, /* results */
	      dag_h, recoveryNodeName, allocList);

  recoveryNode->succedents[0] = termNode;
  for (i=0; i < nReadNodes; i++) {
    recoveryNode->antecedents[i] = rudNodes+i;
    recoveryNode->antType[i] = rf_trueData;
  }

  /* build the read nodes, then come back and fill in recovery params
     and results */
  /* Rud nodes: one per surviving accessed unit; failed pda's are
     skipped (the loop index i only advances on emitted nodes) */
  pda = asmap->physInfo;
  for (i=0; i < nRudNodes; pda = pda->next)
  {
    if ((pda == failedPDA) || (pda == failedPDAtwo))
      continue;
    INIT_DISK_NODE(rudNodes+i,"Rud");
    RF_ASSERT(pda);
    DISK_NODE_PARAMS(rudNodes[i],pda);
    i++;
  }

  /* Rrd nodes: reads of the stripe units not touched by the user
     access, from the list built by rf_DD_GenerateFailedAccessASMs */
  pda = npdas;
  for (i=0; i < nRrdNodes; i++, pda = pda->next)
  {
    INIT_DISK_NODE(rrdNodes+i,"Rrd");
    RF_ASSERT(pda);
    DISK_NODE_PARAMS(rrdNodes[i],pda);
  }

  /* redundancy pdas: pqPDAs is laid out P,Q[,P,Q] so the pointer is
     bumped between node initializations */
  pda = pqPDAs;
  INIT_DISK_NODE(rpNodes,"Rp");
  RF_ASSERT(pda);
  DISK_NODE_PARAMS(rpNodes[0],pda);
  pda++;
  INIT_DISK_NODE(rqNodes,redundantReadNodeName );
  RF_ASSERT(pda);
  DISK_NODE_PARAMS(rqNodes[0],pda);
  /* second P/Q pair exists when the failed accesses fall in two
     disjoint slices of the stripe unit */
  if (nPQNodes==2)
  {
    pda++;
    INIT_DISK_NODE(rpNodes+1,"Rp");
    RF_ASSERT(pda);
    DISK_NODE_PARAMS(rpNodes[1],pda);
    pda++;
    INIT_DISK_NODE( rqNodes+1,redundantReadNodeName );
    RF_ASSERT(pda);
    DISK_NODE_PARAMS(rqNodes[1],pda);
  }

  /* fill in recovery node params: one pda per read node (again walking
     the contiguous read nodes via rudNodes[i]), then raidPtr and asmap */
  for (i=0; i < nReadNodes; i++)
    recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */
  recoveryNode->params[i++].p = (void *) raidPtr;
  recoveryNode->params[i++].p = (void *) asmap;
  /* results: the failed pda('s) the recovery function must fill */
  recoveryNode->results[0] = failedPDA;
  if (asmap->numDataFailed ==2 )
    recoveryNode->results[1] = failedPDAtwo;

  /* zero fill the target data buffers? */
}