1/* $NetBSD: rf_dagffwr.c,v 1.38 2023/10/15 18:15:20 oster Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/* 30 * rf_dagff.c 31 * 32 * code for creating fault-free DAGs 33 * 34 */ 35 36#include <sys/cdefs.h> 37__KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.38 2023/10/15 18:15:20 oster Exp $"); 38 39#include <dev/raidframe/raidframevar.h> 40 41#include "rf_raid.h" 42#include "rf_dag.h" 43#include "rf_dagutils.h" 44#include "rf_dagfuncs.h" 45#include "rf_debugMem.h" 46#include "rf_dagffrd.h" 47#include "rf_general.h" 48#include "rf_dagffwr.h" 49#include "rf_map.h" 50 51/****************************************************************************** 52 * 53 * General comments on DAG creation: 54 * 55 * All DAGs in this file use roll-away error recovery. Each DAG has a single 56 * commit node, usually called "Cmt." 
If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * makes no changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll-forward
 * through the graph, blindly executing nodes until it reaches the end.
 * If a graph reaches the end, it is assumed to have completed successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */


/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */


/*
 * Wrapper: create a write DAG for a non-redundant array by delegating to
 * the generic non-redundant DAG creator.  The "type" argument is part of
 * the standard DAG-creation interface but is intentionally ignored here;
 * the I/O type is forced to WRITE.
 */
void
rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			      RF_DagHeader_t *dag_h, void *bp,
			      RF_RaidAccessFlags_t flags,
			      RF_AllocListElem_t *allocList,
			      RF_IoType_t type)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
				 RF_IO_TYPE_WRITE);
}

/*
 * Wrapper: RAID 0 write.  Identical to the non-redundant case above
 * (RAID 0 computes no redundancy); "type" is likewise ignored and the
 * I/O type is forced to WRITE.
 */
void
rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList,
		       RF_IoType_t type)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
				 RF_IO_TYPE_WRITE);
}

/*
 * Wrapper: small (read-modify-write) parity update, using the standard
 * XOR function table (&rf_xorFuncs) and no Q functions (single-fault
 * tolerant), with "normal" roll-away error recovery.
 */
void
rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList)
{
	/* "normal" rollaway */
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
				     allocList, &rf_xorFuncs, NULL);
}

/*
 * Wrapper: large (reconstruct) write, using the regular XOR function,
 * a single fault tolerated (nfaults == 1), and buffer recycling enabled
 * (RF_TRUE), with "normal" roll-away error recovery.
 */
void
rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList)
{
	/* "normal" rollaway */
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
				     allocList, 1, rf_RegularXorFunc, RF_TRUE);
}


/******************************************************************************
 *
 * DAG creation code begins here
 */
/* Allocate a buffer large enough to hold "num" sectors worth of data,
 * tracked on allocList so it is released with the rest of the DAG. */
#define BUF_ALLOC(num) \
	RF_MallocAndAdd(rf_RaidAddressToByte(raidPtr, num), allocList)


/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *           / Rod \           / Wnd \
 * H -- block- Rod - Xor - Cmt - Wnd --- T
 *           \ Rod /           \ Wnp /
 *                             \[Wnq]/
 *
 * The XOR node also does the Q calculation in the P+Q architecture.
142 * All nodes are before the commit node (Cmt) are assumed to be atomic and 143 * undoable - or - they make no changes to permanent state. 144 * 145 * Rod = read old data 146 * Cmt = commit node 147 * Wnp = write new parity 148 * Wnd = write new data 149 * Wnq = write new "q" 150 * [] denotes optional segments in the graph 151 * 152 * Parameters: raidPtr - description of the physical array 153 * asmap - logical & physical addresses for this access 154 * bp - buffer ptr (holds write data) 155 * flags - general flags (e.g. disk locking) 156 * allocList - list of memory allocated in DAG creation 157 * nfaults - number of faults array can tolerate 158 * (equal to # redundancy units in stripe) 159 * redfuncs - list of redundancy generating functions 160 * 161 *****************************************************************************/ 162 163void 164rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 165 RF_DagHeader_t *dag_h, void *bp, 166 RF_RaidAccessFlags_t flags, 167 RF_AllocListElem_t *allocList, 168 int nfaults, void (*redFunc) (RF_DagNode_t *), 169 int allowBufferRecycle) 170{ 171 RF_DagNode_t *wndNodes, *rodNodes, *xorNode, *wnpNode, *tmpNode; 172 RF_DagNode_t *blockNode, *commitNode, *termNode; 173#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 174 RF_DagNode_t *wnqNode; 175#endif 176 int nWndNodes, nRodNodes, i, nodeNum, asmNum; 177 RF_AccessStripeMapHeader_t *new_asm_h[2]; 178 RF_StripeNum_t parityStripeID; 179 char *sosBuffer, *eosBuffer; 180 RF_ReconUnitNum_t which_ru; 181 RF_RaidLayout_t *layoutPtr; 182 RF_PhysDiskAddr_t *pda; 183 184 layoutPtr = &(raidPtr->Layout); 185 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, 186 asmap->raidAddress, 187 &which_ru); 188 189#if RF_DEBUG_DAG 190 if (rf_dagDebug) { 191 printf("[Creating large-write DAG]\n"); 192 } 193#endif 194 dag_h->creator = "LargeWriteDAG"; 195 196 dag_h->numCommitNodes = 1; 197 dag_h->numCommits = 0; 198 dag_h->numSuccedents = 1; 199 200 /* alloc 
the nodes: Wnd, xor, commit, block, term, and Wnp */ 201 nWndNodes = asmap->numStripeUnitsAccessed; 202 203 for (i = 0; i < nWndNodes; i++) { 204 tmpNode = rf_AllocDAGNode(raidPtr); 205 tmpNode->list_next = dag_h->nodes; 206 dag_h->nodes = tmpNode; 207 } 208 wndNodes = dag_h->nodes; 209 210 xorNode = rf_AllocDAGNode(raidPtr); 211 xorNode->list_next = dag_h->nodes; 212 dag_h->nodes = xorNode; 213 214 wnpNode = rf_AllocDAGNode(raidPtr); 215 wnpNode->list_next = dag_h->nodes; 216 dag_h->nodes = wnpNode; 217 218 blockNode = rf_AllocDAGNode(raidPtr); 219 blockNode->list_next = dag_h->nodes; 220 dag_h->nodes = blockNode; 221 222 commitNode = rf_AllocDAGNode(raidPtr); 223 commitNode->list_next = dag_h->nodes; 224 dag_h->nodes = commitNode; 225 226 termNode = rf_AllocDAGNode(raidPtr); 227 termNode->list_next = dag_h->nodes; 228 dag_h->nodes = termNode; 229 230#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 231 if (nfaults == 2) { 232 wnqNode = rf_AllocDAGNode(raidPtr); 233 } else { 234 wnqNode = NULL; 235 } 236#endif 237 rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, 238 new_asm_h, &nRodNodes, &sosBuffer, 239 &eosBuffer, allocList); 240 if (nRodNodes > 0) { 241 for (i = 0; i < nRodNodes; i++) { 242 tmpNode = rf_AllocDAGNode(raidPtr); 243 tmpNode->list_next = dag_h->nodes; 244 dag_h->nodes = tmpNode; 245 } 246 rodNodes = dag_h->nodes; 247 } else { 248 rodNodes = NULL; 249 } 250 251 /* begin node initialization */ 252 if (nRodNodes > 0) { 253 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, 254 rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0, 255 dag_h, "Nil", allocList); 256 } else { 257 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, 258 rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, 259 dag_h, "Nil", allocList); 260 } 261 262 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, 263 rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0, 264 dag_h, "Cmt", allocList); 265 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, 
266 rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0, 267 dag_h, "Trm", allocList); 268 269 /* initialize the Rod nodes */ 270 tmpNode = rodNodes; 271 for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) { 272 if (new_asm_h[asmNum]) { 273 pda = new_asm_h[asmNum]->stripeMap->physInfo; 274 while (pda) { 275 rf_InitNode(tmpNode, rf_wait, 276 RF_FALSE, rf_DiskReadFunc, 277 rf_DiskReadUndoFunc, 278 rf_GenericWakeupFunc, 279 1, 1, 4, 0, dag_h, 280 "Rod", allocList); 281 tmpNode->params[0].p = pda; 282 tmpNode->params[1].p = pda->bufPtr; 283 tmpNode->params[2].v = parityStripeID; 284 tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 285 which_ru); 286 nodeNum++; 287 pda = pda->next; 288 tmpNode = tmpNode->list_next; 289 } 290 } 291 } 292 RF_ASSERT(nodeNum == nRodNodes); 293 294 /* initialize the wnd nodes */ 295 pda = asmap->physInfo; 296 tmpNode = wndNodes; 297 for (i = 0; i < nWndNodes; i++) { 298 rf_InitNode(tmpNode, rf_wait, RF_FALSE, 299 rf_DiskWriteFunc, rf_DiskWriteUndoFunc, 300 rf_GenericWakeupFunc, 1, 1, 4, 0, 301 dag_h, "Wnd", allocList); 302 RF_ASSERT(pda != NULL); 303 tmpNode->params[0].p = pda; 304 tmpNode->params[1].p = pda->bufPtr; 305 tmpNode->params[2].v = parityStripeID; 306 tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 307 pda = pda->next; 308 tmpNode = tmpNode->list_next; 309 } 310 311 /* initialize the redundancy node */ 312 if (nRodNodes > 0) { 313 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, 314 rf_NullNodeUndoFunc, NULL, 1, 315 nRodNodes, 2 * (nWndNodes + nRodNodes) + 1, 316 nfaults, dag_h, "Xr ", allocList); 317 } else { 318 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, 319 rf_NullNodeUndoFunc, NULL, 1, 320 1, 2 * (nWndNodes + nRodNodes) + 1, 321 nfaults, dag_h, "Xr ", allocList); 322 } 323 xorNode->flags |= RF_DAGNODE_FLAG_YIELD; 324 tmpNode = wndNodes; 325 for (i = 0; i < nWndNodes; i++) { 326 /* pda */ 327 xorNode->params[2 * i + 0] = tmpNode->params[0]; 328 /* buf ptr */ 329 xorNode->params[2 
* i + 1] = tmpNode->params[1]; 330 tmpNode = tmpNode->list_next; 331 } 332 tmpNode = rodNodes; 333 for (i = 0; i < nRodNodes; i++) { 334 /* pda */ 335 xorNode->params[2 * (nWndNodes + i) + 0] = tmpNode->params[0]; 336 /* buf ptr */ 337 xorNode->params[2 * (nWndNodes + i) + 1] = tmpNode->params[1]; 338 tmpNode = tmpNode->list_next; 339 } 340 /* xor node needs to get at RAID information */ 341 xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; 342 343 /* 344 * Look for an Rod node that reads a complete SU. If none, 345 * alloc a buffer to receive the parity info. Note that we 346 * can't use a new data buffer because it will not have gotten 347 * written when the xor occurs. */ 348 if (allowBufferRecycle) { 349 tmpNode = rodNodes; 350 for (i = 0; i < nRodNodes; i++) { 351 if (((RF_PhysDiskAddr_t *) tmpNode->params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit) 352 break; 353 tmpNode = tmpNode->list_next; 354 } 355 } 356 if ((!allowBufferRecycle) || (i == nRodNodes)) { 357 xorNode->results[0] = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit)); 358 } else { 359 /* this works because the only way we get here is if 360 allowBufferRecycle is true and we went through the 361 above for loop, and exited via the break before 362 i==nRodNodes was true. That means tmpNode will 363 still point to a valid node -- the one we want for 364 here! 
*/ 365 xorNode->results[0] = tmpNode->params[1].p; 366 } 367 368 /* initialize the Wnp node */ 369 rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, 370 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, 371 dag_h, "Wnp", allocList); 372 wnpNode->params[0].p = asmap->parityInfo; 373 wnpNode->params[1].p = xorNode->results[0]; 374 wnpNode->params[2].v = parityStripeID; 375 wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 376 /* parityInfo must describe entire parity unit */ 377 RF_ASSERT(asmap->parityInfo->next == NULL); 378 379#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 380 if (nfaults == 2) { 381 /* 382 * We never try to recycle a buffer for the Q calcuation 383 * in addition to the parity. This would cause two buffers 384 * to get smashed during the P and Q calculation, guaranteeing 385 * one would be wrong. 386 */ 387 xorNode->results[1] = 388 BUF_ALLOC(raidPtr->Layout.sectorsPerStripeUnit); 389 rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, 390 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 391 1, 1, 4, 0, dag_h, "Wnq", allocList); 392 wnqNode->params[0].p = asmap->qInfo; 393 wnqNode->params[1].p = xorNode->results[1]; 394 wnqNode->params[2].v = parityStripeID; 395 wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 396 /* parityInfo must describe entire parity unit */ 397 RF_ASSERT(asmap->parityInfo->next == NULL); 398 } 399#endif 400 /* 401 * Connect nodes to form graph. 
402 */ 403 404 /* connect dag header to block node */ 405 RF_ASSERT(blockNode->numAntecedents == 0); 406 dag_h->succedents[0] = blockNode; 407 408 if (nRodNodes > 0) { 409 /* connect the block node to the Rod nodes */ 410 RF_ASSERT(blockNode->numSuccedents == nRodNodes); 411 RF_ASSERT(xorNode->numAntecedents == nRodNodes); 412 tmpNode = rodNodes; 413 for (i = 0; i < nRodNodes; i++) { 414 RF_ASSERT(tmpNode->numAntecedents == 1); 415 blockNode->succedents[i] = tmpNode; 416 tmpNode->antecedents[0] = blockNode; 417 tmpNode->antType[0] = rf_control; 418 419 /* connect the Rod nodes to the Xor node */ 420 RF_ASSERT(tmpNode->numSuccedents == 1); 421 tmpNode->succedents[0] = xorNode; 422 xorNode->antecedents[i] = tmpNode; 423 xorNode->antType[i] = rf_trueData; 424 tmpNode = tmpNode->list_next; 425 } 426 } else { 427 /* connect the block node to the Xor node */ 428 RF_ASSERT(blockNode->numSuccedents == 1); 429 RF_ASSERT(xorNode->numAntecedents == 1); 430 blockNode->succedents[0] = xorNode; 431 xorNode->antecedents[0] = blockNode; 432 xorNode->antType[0] = rf_control; 433 } 434 435 /* connect the xor node to the commit node */ 436 RF_ASSERT(xorNode->numSuccedents == 1); 437 RF_ASSERT(commitNode->numAntecedents == 1); 438 xorNode->succedents[0] = commitNode; 439 commitNode->antecedents[0] = xorNode; 440 commitNode->antType[0] = rf_control; 441 442 /* connect the commit node to the write nodes */ 443 RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults); 444 tmpNode = wndNodes; 445 for (i = 0; i < nWndNodes; i++) { 446 RF_ASSERT(wndNodes->numAntecedents == 1); 447 commitNode->succedents[i] = tmpNode; 448 tmpNode->antecedents[0] = commitNode; 449 tmpNode->antType[0] = rf_control; 450 tmpNode = tmpNode->list_next; 451 } 452 RF_ASSERT(wnpNode->numAntecedents == 1); 453 commitNode->succedents[nWndNodes] = wnpNode; 454 wnpNode->antecedents[0] = commitNode; 455 wnpNode->antType[0] = rf_trueData; 456#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 457 if (nfaults == 2) { 
458 RF_ASSERT(wnqNode->numAntecedents == 1); 459 commitNode->succedents[nWndNodes + 1] = wnqNode; 460 wnqNode->antecedents[0] = commitNode; 461 wnqNode->antType[0] = rf_trueData; 462 } 463#endif 464 /* connect the write nodes to the term node */ 465 RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults); 466 RF_ASSERT(termNode->numSuccedents == 0); 467 tmpNode = wndNodes; 468 for (i = 0; i < nWndNodes; i++) { 469 RF_ASSERT(wndNodes->numSuccedents == 1); 470 tmpNode->succedents[0] = termNode; 471 termNode->antecedents[i] = tmpNode; 472 termNode->antType[i] = rf_control; 473 tmpNode = tmpNode->list_next; 474 } 475 RF_ASSERT(wnpNode->numSuccedents == 1); 476 wnpNode->succedents[0] = termNode; 477 termNode->antecedents[nWndNodes] = wnpNode; 478 termNode->antType[nWndNodes] = rf_control; 479#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 480 if (nfaults == 2) { 481 RF_ASSERT(wnqNode->numSuccedents == 1); 482 wnqNode->succedents[0] = termNode; 483 termNode->antecedents[nWndNodes + 1] = wnqNode; 484 termNode->antType[nWndNodes + 1] = rf_control; 485 } 486#endif 487} 488/****************************************************************************** 489 * 490 * creates a DAG to perform a small-write operation (either raid 5 or pq), 491 * which is as follows: 492 * 493 * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm 494 * \- Rod X / \----> Wnd [Und]-/ 495 * [\- Rod X / \---> Wnd [Und]-/] 496 * [\- Roq -> Q / \--> Wnq [Unq]-/] 497 * 498 * Rop = read old parity 499 * Rod = read old data 500 * Roq = read old "q" 501 * Cmt = commit node 502 * Und = unlock data disk 503 * Unp = unlock parity disk 504 * Unq = unlock q disk 505 * Wnp = write new parity 506 * Wnd = write new data 507 * Wnq = write new "q" 508 * [ ] denotes optional segments in the graph 509 * 510 * Parameters: raidPtr - description of the physical array 511 * asmap - logical & physical addresses for this access 512 * bp - buffer ptr (holds write data) 513 * flags - general flags (e.g. 
disk locking) 514 * allocList - list of memory allocated in DAG creation 515 * pfuncs - list of parity generating functions 516 * qfuncs - list of q generating functions 517 * 518 * A null qfuncs indicates single fault tolerant 519 *****************************************************************************/ 520 521void 522rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 523 RF_DagHeader_t *dag_h, void *bp, 524 RF_RaidAccessFlags_t flags, 525 RF_AllocListElem_t *allocList, 526 const RF_RedFuncs_t *pfuncs, 527 const RF_RedFuncs_t *qfuncs) 528{ 529 RF_DagNode_t *readDataNodes, *readParityNodes, *termNode; 530 RF_DagNode_t *tmpNode, *tmpreadDataNode, *tmpreadParityNode; 531 RF_DagNode_t *xorNodes, *blockNode, *commitNode; 532 RF_DagNode_t *writeDataNodes, *writeParityNodes; 533 RF_DagNode_t *tmpxorNode, *tmpwriteDataNode; 534 RF_DagNode_t *tmpwriteParityNode; 535#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 536 RF_DagNode_t *tmpwriteQNode, *tmpreadQNode, *tmpqNode, *readQNodes, 537 *writeQNodes, *qNodes; 538#endif 539 int i, j, nNodes; 540 RF_ReconUnitNum_t which_ru; 541 void (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *); 542 void (*qfunc) (RF_DagNode_t *) __unused; 543 int numDataNodes, numParityNodes; 544 RF_StripeNum_t parityStripeID; 545 RF_PhysDiskAddr_t *pda; 546 const char *name, *qname __unused; 547 long nfaults; 548 549 nfaults = qfuncs ? 2 : 1; 550 551 parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), 552 asmap->raidAddress, &which_ru); 553 pda = asmap->physInfo; 554 numDataNodes = asmap->numStripeUnitsAccessed; 555 numParityNodes = (asmap->parityInfo->next) ? 2 : 1; 556 557#if RF_DEBUG_DAG 558 if (rf_dagDebug) { 559 printf("[Creating small-write DAG]\n"); 560 } 561#endif 562 RF_ASSERT(numDataNodes > 0); 563 dag_h->creator = "SmallWriteDAG"; 564 565 dag_h->numCommitNodes = 1; 566 dag_h->numCommits = 0; 567 dag_h->numSuccedents = 1; 568 569 /* 570 * DAG creation occurs in four steps: 571 * 1. 
count the number of nodes in the DAG 572 * 2. create the nodes 573 * 3. initialize the nodes 574 * 4. connect the nodes 575 */ 576 577 /* 578 * Step 1. compute number of nodes in the graph 579 */ 580 581 /* number of nodes: a read and write for each data unit a 582 * redundancy computation node for each parity node (nfaults * 583 * nparity) a read and write for each parity unit a block and 584 * commit node (2) a terminate node if atomic RMW an unlock 585 * node for each data unit, redundancy unit 586 * totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) 587 * + (nfaults * 2 * numParityNodes) + 3; 588 */ 589 590 /* 591 * Step 2. create the nodes 592 */ 593 594 blockNode = rf_AllocDAGNode(raidPtr); 595 blockNode->list_next = dag_h->nodes; 596 dag_h->nodes = blockNode; 597 598 commitNode = rf_AllocDAGNode(raidPtr); 599 commitNode->list_next = dag_h->nodes; 600 dag_h->nodes = commitNode; 601 602 for (i = 0; i < numDataNodes; i++) { 603 tmpNode = rf_AllocDAGNode(raidPtr); 604 tmpNode->list_next = dag_h->nodes; 605 dag_h->nodes = tmpNode; 606 } 607 readDataNodes = dag_h->nodes; 608 609 for (i = 0; i < numParityNodes; i++) { 610 tmpNode = rf_AllocDAGNode(raidPtr); 611 tmpNode->list_next = dag_h->nodes; 612 dag_h->nodes = tmpNode; 613 } 614 readParityNodes = dag_h->nodes; 615 616 for (i = 0; i < numDataNodes; i++) { 617 tmpNode = rf_AllocDAGNode(raidPtr); 618 tmpNode->list_next = dag_h->nodes; 619 dag_h->nodes = tmpNode; 620 } 621 writeDataNodes = dag_h->nodes; 622 623 for (i = 0; i < numParityNodes; i++) { 624 tmpNode = rf_AllocDAGNode(raidPtr); 625 tmpNode->list_next = dag_h->nodes; 626 dag_h->nodes = tmpNode; 627 } 628 writeParityNodes = dag_h->nodes; 629 630 for (i = 0; i < numParityNodes; i++) { 631 tmpNode = rf_AllocDAGNode(raidPtr); 632 tmpNode->list_next = dag_h->nodes; 633 dag_h->nodes = tmpNode; 634 } 635 xorNodes = dag_h->nodes; 636 637 termNode = rf_AllocDAGNode(raidPtr); 638 termNode->list_next = dag_h->nodes; 639 dag_h->nodes = termNode; 640 
641#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 642 if (nfaults == 2) { 643 for (i = 0; i < numParityNodes; i++) { 644 tmpNode = rf_AllocDAGNode(raidPtr); 645 tmpNode->list_next = dag_h->nodes; 646 dag_h->nodes = tmpNode; 647 } 648 readQNodes = dag_h->nodes; 649 650 for (i = 0; i < numParityNodes; i++) { 651 tmpNode = rf_AllocDAGNode(raidPtr); 652 tmpNode->list_next = dag_h->nodes; 653 dag_h->nodes = tmpNode; 654 } 655 writeQNodes = dag_h->nodes; 656 657 for (i = 0; i < numParityNodes; i++) { 658 tmpNode = rf_AllocDAGNode(raidPtr); 659 tmpNode->list_next = dag_h->nodes; 660 dag_h->nodes = tmpNode; 661 } 662 qNodes = dag_h->nodes; 663 } else { 664 readQNodes = writeQNodes = qNodes = NULL; 665 } 666#endif 667 668 /* 669 * Step 3. initialize the nodes 670 */ 671 /* initialize block node (Nil) */ 672 nNodes = numDataNodes + (nfaults * numParityNodes); 673 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, 674 rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, 675 dag_h, "Nil", allocList); 676 677 /* initialize commit node (Cmt) */ 678 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, 679 rf_NullNodeUndoFunc, NULL, nNodes, 680 (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList); 681 682 /* initialize terminate node (Trm) */ 683 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, 684 rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0, 685 dag_h, "Trm", allocList); 686 687 /* initialize nodes which read old data (Rod) */ 688 tmpreadDataNode = readDataNodes; 689 for (i = 0; i < numDataNodes; i++) { 690 rf_InitNode(tmpreadDataNode, rf_wait, RF_FALSE, 691 rf_DiskReadFunc, rf_DiskReadUndoFunc, 692 rf_GenericWakeupFunc, (nfaults * numParityNodes), 693 1, 4, 0, dag_h, "Rod", allocList); 694 RF_ASSERT(pda != NULL); 695 /* physical disk addr desc */ 696 tmpreadDataNode->params[0].p = pda; 697 /* buffer to hold old data */ 698 tmpreadDataNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); 699 
tmpreadDataNode->params[2].v = parityStripeID; 700 tmpreadDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 701 which_ru); 702 pda = pda->next; 703 for (j = 0; j < tmpreadDataNode->numSuccedents; j++) { 704 tmpreadDataNode->propList[j] = NULL; 705 } 706 tmpreadDataNode = tmpreadDataNode->list_next; 707 } 708 709 /* initialize nodes which read old parity (Rop) */ 710 pda = asmap->parityInfo; 711 i = 0; 712 tmpreadParityNode = readParityNodes; 713 for (i = 0; i < numParityNodes; i++) { 714 RF_ASSERT(pda != NULL); 715 rf_InitNode(tmpreadParityNode, rf_wait, RF_FALSE, 716 rf_DiskReadFunc, rf_DiskReadUndoFunc, 717 rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, 718 dag_h, "Rop", allocList); 719 tmpreadParityNode->params[0].p = pda; 720 /* buffer to hold old parity */ 721 tmpreadParityNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector); 722 tmpreadParityNode->params[2].v = parityStripeID; 723 tmpreadParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 724 which_ru); 725 pda = pda->next; 726 for (j = 0; j < tmpreadParityNode->numSuccedents; j++) { 727 tmpreadParityNode->propList[0] = NULL; 728 } 729 tmpreadParityNode = tmpreadParityNode->list_next; 730 } 731 732#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 733 /* initialize nodes which read old Q (Roq) */ 734 if (nfaults == 2) { 735 pda = asmap->qInfo; 736 tmpreadQNode = readQNodes; 737 for (i = 0; i < numParityNodes; i++) { 738 RF_ASSERT(pda != NULL); 739 rf_InitNode(tmpreadQNode, rf_wait, RF_FALSE, 740 rf_DiskReadFunc, rf_DiskReadUndoFunc, 741 rf_GenericWakeupFunc, numParityNodes, 742 1, 4, 0, dag_h, "Roq", allocList); 743 tmpreadQNode->params[0].p = pda; 744 /* buffer to hold old Q */ 745 tmpreadQNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, 746 pda->numSector << raidPtr->logBytesPerSector); 747 tmpreadQNode->params[2].v = parityStripeID; 748 tmpreadQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 749 which_ru); 750 pda = 
pda->next; 751 for (j = 0; j < tmpreadQNode->numSuccedents; j++) { 752 tmpreadQNode->propList[0] = NULL; 753 } 754 tmpreadQNode = tmpreadQNode->list_next; 755 } 756 } 757#endif 758 /* initialize nodes which write new data (Wnd) */ 759 pda = asmap->physInfo; 760 tmpwriteDataNode = writeDataNodes; 761 for (i = 0; i < numDataNodes; i++) { 762 RF_ASSERT(pda != NULL); 763 rf_InitNode(tmpwriteDataNode, rf_wait, RF_FALSE, 764 rf_DiskWriteFunc, rf_DiskWriteUndoFunc, 765 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, 766 "Wnd", allocList); 767 /* physical disk addr desc */ 768 tmpwriteDataNode->params[0].p = pda; 769 /* buffer holding new data to be written */ 770 tmpwriteDataNode->params[1].p = pda->bufPtr; 771 tmpwriteDataNode->params[2].v = parityStripeID; 772 tmpwriteDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 773 which_ru); 774 pda = pda->next; 775 tmpwriteDataNode = tmpwriteDataNode->list_next; 776 } 777 778 /* 779 * Initialize nodes which compute new parity and Q. 780 */ 781 /* 782 * We use the simple XOR func in the double-XOR case, and when 783 * we're accessing only a portion of one stripe unit. The 784 * distinction between the two is that the regular XOR func 785 * assumes that the targbuf is a full SU in size, and examines 786 * the pda associated with the buffer to decide where within 787 * the buffer to XOR the data, whereas the simple XOR func 788 * just XORs the data into the start of the buffer. 
*/ 789 if ((numParityNodes == 2) || ((numDataNodes == 1) 790 && (asmap->totalSectorsAccessed < 791 raidPtr->Layout.sectorsPerStripeUnit))) { 792 func = pfuncs->simple; 793 undoFunc = rf_NullNodeUndoFunc; 794 name = pfuncs->SimpleName; 795 if (qfuncs) { 796 qfunc = qfuncs->simple; 797 qname = qfuncs->SimpleName; 798 } else { 799 qfunc = NULL; 800 qname = NULL; 801 } 802 } else { 803 func = pfuncs->regular; 804 undoFunc = rf_NullNodeUndoFunc; 805 name = pfuncs->RegularName; 806 if (qfuncs) { 807 qfunc = qfuncs->regular; 808 qname = qfuncs->RegularName; 809 } else { 810 qfunc = NULL; 811 qname = NULL; 812 } 813 } 814 /* 815 * Initialize the xor nodes: params are {pda,buf} 816 * from {Rod,Wnd,Rop} nodes, and raidPtr 817 */ 818 if (numParityNodes == 2) { 819 /* double-xor case */ 820 tmpxorNode = xorNodes; 821 tmpreadDataNode = readDataNodes; 822 tmpreadParityNode = readParityNodes; 823 tmpwriteDataNode = writeDataNodes; 824#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 825 tmpqNode = qNodes; 826 tmpreadQNode = readQNodes; 827#endif 828 for (i = 0; i < numParityNodes; i++) { 829 /* note: no wakeup func for xor */ 830 rf_InitNode(tmpxorNode, rf_wait, RF_FALSE, func, 831 undoFunc, NULL, 1, 832 (numDataNodes + numParityNodes), 833 7, 1, dag_h, name, allocList); 834 tmpxorNode->flags |= RF_DAGNODE_FLAG_YIELD; 835 tmpxorNode->params[0] = tmpreadDataNode->params[0]; 836 tmpxorNode->params[1] = tmpreadDataNode->params[1]; 837 tmpxorNode->params[2] = tmpreadParityNode->params[0]; 838 tmpxorNode->params[3] = tmpreadParityNode->params[1]; 839 tmpxorNode->params[4] = tmpwriteDataNode->params[0]; 840 tmpxorNode->params[5] = tmpwriteDataNode->params[1]; 841 tmpxorNode->params[6].p = raidPtr; 842 /* use old parity buf as target buf */ 843 tmpxorNode->results[0] = tmpreadParityNode->params[1].p; 844#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 845 if (nfaults == 2) { 846 /* note: no wakeup func for qor */ 847 rf_InitNode(tmpqNode, rf_wait, RF_FALSE, 848 qfunc, 
undoFunc, NULL, 1, 849 (numDataNodes + numParityNodes), 850 7, 1, dag_h, qname, allocList); 851 tmpqNode->params[0] = tmpreadDataNode->params[0]; 852 tmpqNode->params[1] = tmpreadDataNode->params[1]; 853 tmpqNode->params[2] = tmpreadQNode->params[0]; 854 tmpqNode->params[3] = tmpreadQNode->params[1]; 855 tmpqNode->params[4] = tmpwriteDataNode->params[0]; 856 tmpqNode->params[5] = tmpwriteDataNode->params[1]; 857 tmpqNode->params[6].p = raidPtr; 858 /* use old Q buf as target buf */ 859 tmpqNode->results[0] = tmpreadQNode->params[1].p; 860 tmpqNode = tmpqNode->list_next; 861 tmpreadQNode = tmpreadQNode->list_next; 862 } 863#endif 864 tmpxorNode = tmpxorNode->list_next; 865 tmpreadDataNode = tmpreadDataNode->list_next; 866 tmpreadParityNode = tmpreadParityNode->list_next; 867 tmpwriteDataNode = tmpwriteDataNode->list_next; 868 } 869 } else { 870 /* there is only one xor node in this case */ 871 rf_InitNode(xorNodes, rf_wait, RF_FALSE, func, 872 undoFunc, NULL, 1, (numDataNodes + numParityNodes), 873 (2 * (numDataNodes + numDataNodes + 1) + 1), 1, 874 dag_h, name, allocList); 875 xorNodes->flags |= RF_DAGNODE_FLAG_YIELD; 876 tmpreadDataNode = readDataNodes; 877 for (i = 0; i < numDataNodes; i++) { /* used to be"numDataNodes + 1" until we factored 878 out the "+1" into the "deal with Rop separately below */ 879 /* set up params related to Rod nodes */ 880 xorNodes->params[2 * i + 0] = tmpreadDataNode->params[0]; /* pda */ 881 xorNodes->params[2 * i + 1] = tmpreadDataNode->params[1]; /* buffer ptr */ 882 tmpreadDataNode = tmpreadDataNode->list_next; 883 } 884 /* deal with Rop separately */ 885 xorNodes->params[2 * numDataNodes + 0] = readParityNodes->params[0]; /* pda */ 886 xorNodes->params[2 * numDataNodes + 1] = readParityNodes->params[1]; /* buffer ptr */ 887 888 tmpwriteDataNode = writeDataNodes; 889 for (i = 0; i < numDataNodes; i++) { 890 /* set up params related to Wnd and Wnp nodes */ 891 xorNodes->params[2 * (numDataNodes + 1 + i) + 0] = /* pda */ 892 
tmpwriteDataNode->params[0]; 893 xorNodes->params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */ 894 tmpwriteDataNode->params[1]; 895 tmpwriteDataNode = tmpwriteDataNode->list_next; 896 } 897 /* xor node needs to get at RAID information */ 898 xorNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; 899 xorNodes->results[0] = readParityNodes->params[1].p; 900#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 901 if (nfaults == 2) { 902 rf_InitNode(qNodes, rf_wait, RF_FALSE, qfunc, 903 undoFunc, NULL, 1, 904 (numDataNodes + numParityNodes), 905 (2 * (numDataNodes + numDataNodes + 1) + 1), 1, 906 dag_h, qname, allocList); 907 tmpreadDataNode = readDataNodes; 908 for (i = 0; i < numDataNodes; i++) { 909 /* set up params related to Rod */ 910 qNodes->params[2 * i + 0] = tmpreadDataNode->params[0]; /* pda */ 911 qNodes->params[2 * i + 1] = tmpreadDataNode->params[1]; /* buffer ptr */ 912 tmpreadDataNode = tmpreadDataNode->list_next; 913 } 914 /* and read old q */ 915 qNodes->params[2 * numDataNodes + 0] = /* pda */ 916 readQNodes->params[0]; 917 qNodes->params[2 * numDataNodes + 1] = /* buffer ptr */ 918 readQNodes->params[1]; 919 tmpwriteDataNode = writeDataNodes; 920 for (i = 0; i < numDataNodes; i++) { 921 /* set up params related to Wnd nodes */ 922 qNodes->params[2 * (numDataNodes + 1 + i) + 0] = /* pda */ 923 tmpwriteDataNode->params[0]; 924 qNodes->params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */ 925 tmpwriteDataNode->params[1]; 926 tmpwriteDataNode = tmpwriteDataNode->list_next; 927 } 928 /* xor node needs to get at RAID information */ 929 qNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; 930 qNodes->results[0] = readQNodes->params[1].p; 931 } 932#endif 933 } 934 935 /* initialize nodes which write new parity (Wnp) */ 936 pda = asmap->parityInfo; 937 tmpwriteParityNode = writeParityNodes; 938 tmpxorNode = xorNodes; 939 for (i = 0; i < numParityNodes; i++) { 940 rf_InitNode(tmpwriteParityNode, rf_wait, RF_FALSE, 941 
rf_DiskWriteFunc, rf_DiskWriteUndoFunc, 942 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, 943 "Wnp", allocList); 944 RF_ASSERT(pda != NULL); 945 tmpwriteParityNode->params[0].p = pda; /* param 1 (bufPtr) 946 * filled in by xor node */ 947 tmpwriteParityNode->params[1].p = tmpxorNode->results[0]; /* buffer pointer for 948 * parity write 949 * operation */ 950 tmpwriteParityNode->params[2].v = parityStripeID; 951 tmpwriteParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 952 which_ru); 953 pda = pda->next; 954 tmpwriteParityNode = tmpwriteParityNode->list_next; 955 tmpxorNode = tmpxorNode->list_next; 956 } 957 958#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 959 /* initialize nodes which write new Q (Wnq) */ 960 if (nfaults == 2) { 961 pda = asmap->qInfo; 962 tmpwriteQNode = writeQNodes; 963 tmpqNode = qNodes; 964 for (i = 0; i < numParityNodes; i++) { 965 rf_InitNode(tmpwriteQNode, rf_wait, RF_FALSE, 966 rf_DiskWriteFunc, rf_DiskWriteUndoFunc, 967 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, 968 "Wnq", allocList); 969 RF_ASSERT(pda != NULL); 970 tmpwriteQNode->params[0].p = pda; /* param 1 (bufPtr) 971 * filled in by xor node */ 972 tmpwriteQNode->params[1].p = tmpqNode->results[0]; /* buffer pointer for 973 * parity write 974 * operation */ 975 tmpwriteQNode->params[2].v = parityStripeID; 976 tmpwriteQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 977 which_ru); 978 pda = pda->next; 979 tmpwriteQNode = tmpwriteQNode->list_next; 980 tmpqNode = tmpqNode->list_next; 981 } 982 } 983#endif 984 /* 985 * Step 4. connect the nodes. 
986 */ 987 988 /* connect header to block node */ 989 dag_h->succedents[0] = blockNode; 990 991 /* connect block node to read old data nodes */ 992 RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults))); 993 tmpreadDataNode = readDataNodes; 994 for (i = 0; i < numDataNodes; i++) { 995 blockNode->succedents[i] = tmpreadDataNode; 996 RF_ASSERT(tmpreadDataNode->numAntecedents == 1); 997 tmpreadDataNode->antecedents[0] = blockNode; 998 tmpreadDataNode->antType[0] = rf_control; 999 tmpreadDataNode = tmpreadDataNode->list_next; 1000 } 1001 1002 /* connect block node to read old parity nodes */ 1003 tmpreadParityNode = readParityNodes; 1004 for (i = 0; i < numParityNodes; i++) { 1005 blockNode->succedents[numDataNodes + i] = tmpreadParityNode; 1006 RF_ASSERT(tmpreadParityNode->numAntecedents == 1); 1007 tmpreadParityNode->antecedents[0] = blockNode; 1008 tmpreadParityNode->antType[0] = rf_control; 1009 tmpreadParityNode = tmpreadParityNode->list_next; 1010 } 1011 1012#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 1013 /* connect block node to read old Q nodes */ 1014 if (nfaults == 2) { 1015 tmpreadQNode = readQNodes; 1016 for (i = 0; i < numParityNodes; i++) { 1017 blockNode->succedents[numDataNodes + numParityNodes + i] = tmpreadQNode; 1018 RF_ASSERT(tmpreadQNode->numAntecedents == 1); 1019 tmpreadQNode->antecedents[0] = blockNode; 1020 tmpreadQNode->antType[0] = rf_control; 1021 tmpreadQNode = tmpreadQNode->list_next; 1022 } 1023 } 1024#endif 1025 /* connect read old data nodes to xor nodes */ 1026 tmpreadDataNode = readDataNodes; 1027 for (i = 0; i < numDataNodes; i++) { 1028 RF_ASSERT(tmpreadDataNode->numSuccedents == (nfaults * numParityNodes)); 1029 tmpxorNode = xorNodes; 1030 for (j = 0; j < numParityNodes; j++) { 1031 RF_ASSERT(tmpxorNode->numAntecedents == numDataNodes + numParityNodes); 1032 tmpreadDataNode->succedents[j] = tmpxorNode; 1033 tmpxorNode->antecedents[i] = tmpreadDataNode; 1034 tmpxorNode->antType[i] = 
rf_trueData; 1035 tmpxorNode = tmpxorNode->list_next; 1036 } 1037 tmpreadDataNode = tmpreadDataNode->list_next; 1038 } 1039 1040#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 1041 /* connect read old data nodes to q nodes */ 1042 if (nfaults == 2) { 1043 tmpreadDataNode = readDataNodes; 1044 for (i = 0; i < numDataNodes; i++) { 1045 tmpqNode = qNodes; 1046 for (j = 0; j < numParityNodes; j++) { 1047 RF_ASSERT(tmpqNode->numAntecedents == numDataNodes + numParityNodes); 1048 tmpreadDataNode->succedents[numParityNodes + j] = tmpqNode; 1049 tmpqNode->antecedents[i] = tmpreadDataNode; 1050 tmpqNode->antType[i] = rf_trueData; 1051 tmpqNode = tmpqNode->list_next; 1052 } 1053 tmpreadDataNode = tmpreadDataNode->list_next; 1054 } 1055 } 1056#endif 1057 /* connect read old parity nodes to xor nodes */ 1058 tmpreadParityNode = readParityNodes; 1059 for (i = 0; i < numParityNodes; i++) { 1060 RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes); 1061 tmpxorNode = xorNodes; 1062 for (j = 0; j < numParityNodes; j++) { 1063 tmpreadParityNode->succedents[j] = tmpxorNode; 1064 tmpxorNode->antecedents[numDataNodes + i] = tmpreadParityNode; 1065 tmpxorNode->antType[numDataNodes + i] = rf_trueData; 1066 tmpxorNode = tmpxorNode->list_next; 1067 } 1068 tmpreadParityNode = tmpreadParityNode->list_next; 1069 } 1070 1071#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 1072 /* connect read old q nodes to q nodes */ 1073 if (nfaults == 2) { 1074 tmpreadParityNode = readParityNodes; 1075 tmpreadQNode = readQNodes; 1076 for (i = 0; i < numParityNodes; i++) { 1077 RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes); 1078 tmpqNode = qNodes; 1079 for (j = 0; j < numParityNodes; j++) { 1080 tmpreadQNode->succedents[j] = tmpqNode; 1081 tmpqNode->antecedents[numDataNodes + i] = tmpreadQNode; 1082 tmpqNode->antType[numDataNodes + i] = rf_trueData; 1083 tmpqNode = tmpqNode->list_next; 1084 } 1085 tmpreadParityNode = tmpreadParityNode->list_next; 1086 tmpreadQNode = 
tmpreadQNode->list_next; 1087 } 1088 } 1089#endif 1090 /* connect xor nodes to commit node */ 1091 RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes)); 1092 tmpxorNode = xorNodes; 1093 for (i = 0; i < numParityNodes; i++) { 1094 RF_ASSERT(tmpxorNode->numSuccedents == 1); 1095 tmpxorNode->succedents[0] = commitNode; 1096 commitNode->antecedents[i] = tmpxorNode; 1097 commitNode->antType[i] = rf_control; 1098 tmpxorNode = tmpxorNode->list_next; 1099 } 1100 1101#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 1102 /* connect q nodes to commit node */ 1103 if (nfaults == 2) { 1104 tmpqNode = qNodes; 1105 for (i = 0; i < numParityNodes; i++) { 1106 RF_ASSERT(tmpqNode->numSuccedents == 1); 1107 tmpqNode->succedents[0] = commitNode; 1108 commitNode->antecedents[i + numParityNodes] = tmpqNode; 1109 commitNode->antType[i + numParityNodes] = rf_control; 1110 tmpqNode = tmpqNode->list_next; 1111 } 1112 } 1113#endif 1114 /* connect commit node to write nodes */ 1115 RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes))); 1116 tmpwriteDataNode = writeDataNodes; 1117 for (i = 0; i < numDataNodes; i++) { 1118 RF_ASSERT(tmpwriteDataNode->numAntecedents == 1); 1119 commitNode->succedents[i] = tmpwriteDataNode; 1120 tmpwriteDataNode->antecedents[0] = commitNode; 1121 tmpwriteDataNode->antType[0] = rf_trueData; 1122 tmpwriteDataNode = tmpwriteDataNode->list_next; 1123 } 1124 tmpwriteParityNode = writeParityNodes; 1125 for (i = 0; i < numParityNodes; i++) { 1126 RF_ASSERT(tmpwriteParityNode->numAntecedents == 1); 1127 commitNode->succedents[i + numDataNodes] = tmpwriteParityNode; 1128 tmpwriteParityNode->antecedents[0] = commitNode; 1129 tmpwriteParityNode->antType[0] = rf_trueData; 1130 tmpwriteParityNode = tmpwriteParityNode->list_next; 1131 } 1132#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 1133 if (nfaults == 2) { 1134 tmpwriteQNode = writeQNodes; 1135 for (i = 0; i < numParityNodes; i++) { 1136 
RF_ASSERT(tmpwriteQNode->numAntecedents == 1); 1137 commitNode->succedents[i + numDataNodes + numParityNodes] = tmpwriteQNode; 1138 tmpwriteQNode->antecedents[0] = commitNode; 1139 tmpwriteQNode->antType[0] = rf_trueData; 1140 tmpwriteQNode = tmpwriteQNode->list_next; 1141 } 1142 } 1143#endif 1144 RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); 1145 RF_ASSERT(termNode->numSuccedents == 0); 1146 tmpwriteDataNode = writeDataNodes; 1147 for (i = 0; i < numDataNodes; i++) { 1148 /* connect write new data nodes to term node */ 1149 RF_ASSERT(tmpwriteDataNode->numSuccedents == 1); 1150 RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); 1151 tmpwriteDataNode->succedents[0] = termNode; 1152 termNode->antecedents[i] = tmpwriteDataNode; 1153 termNode->antType[i] = rf_control; 1154 tmpwriteDataNode = tmpwriteDataNode->list_next; 1155 } 1156 1157 tmpwriteParityNode = writeParityNodes; 1158 for (i = 0; i < numParityNodes; i++) { 1159 RF_ASSERT(tmpwriteParityNode->numSuccedents == 1); 1160 tmpwriteParityNode->succedents[0] = termNode; 1161 termNode->antecedents[numDataNodes + i] = tmpwriteParityNode; 1162 termNode->antType[numDataNodes + i] = rf_control; 1163 tmpwriteParityNode = tmpwriteParityNode->list_next; 1164 } 1165 1166#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 1167 if (nfaults == 2) { 1168 tmpwriteQNode = writeQNodes; 1169 for (i = 0; i < numParityNodes; i++) { 1170 RF_ASSERT(tmpwriteQNode->numSuccedents == 1); 1171 tmpwriteQNode->succedents[0] = termNode; 1172 termNode->antecedents[numDataNodes + numParityNodes + i] = tmpwriteQNode; 1173 termNode->antType[numDataNodes + numParityNodes + i] = rf_control; 1174 tmpwriteQNode = tmpwriteQNode->list_next; 1175 } 1176 } 1177#endif 1178} 1179 1180 1181/****************************************************************************** 1182 * create a write graph (fault-free or degraded) for RAID level 1 1183 * 1184 * Hdr -> Commit -> Wpd -> Nil -> Trm 
 *                          -> Wsd ->
 *
 * The "Wpd" node writes data to the primary copy in the mirror pair
 * The "Wsd" node writes data to the secondary copy in the mirror pair
 *
 * In degraded mode the write to the failed half of the pair is simply
 * omitted (see the numDataFailed / numParityFailed adjustments below).
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *****************************************************************************/

void
rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			 RF_DagHeader_t *dag_h, void *bp,
			 RF_RaidAccessFlags_t flags,
			 RF_AllocListElem_t *allocList)
{
	RF_DagNode_t *unblockNode, *termNode, *commitNode;
	RF_DagNode_t *wndNode, *wmirNode;
	RF_DagNode_t *tmpNode, *tmpwndNode, *tmpwmirNode;
	int nWndNodes, nWmirNodes, i;
	RF_ReconUnitNum_t which_ru;
	RF_PhysDiskAddr_t *pda, *pdaP;
	RF_StripeNum_t parityStripeID;

	/*
	 * Map this access onto its parity stripe; which_ru is filled in as
	 * an out-parameter and is later folded into each write node's
	 * priority/RU parameter via RF_CREATE_PARAM3.
	 */
	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
	    asmap->raidAddress, &which_ru);
#if RF_DEBUG_DAG
	if (rf_dagDebug) {
		printf("[Creating RAID level 1 write DAG]\n");
	}
#endif
	dag_h->creator = "RaidOneWriteDAG";

	/* 2 implies access not SU aligned */
	nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
	nWndNodes = (asmap->physInfo->next) ? 2 : 1;

	/* alloc the Wnd nodes and the Wmir node */
	/* in degraded mode, drop the write to whichever copy has failed */
	if (asmap->numDataFailed == 1)
		nWndNodes--;
	if (asmap->numParityFailed == 1)
		nWmirNodes--;

	/* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
	 * + terminator) */
	/*
	 * Nodes are pushed onto the front of dag_h->nodes, so after each
	 * group is allocated the head pointer (wndNode / wmirNode) names the
	 * group's first node and list_next walks the rest of that group.
	 */
	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNode = dag_h->nodes;

	for (i = 0; i < nWmirNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wmirNode = dag_h->nodes;

	commitNode = rf_AllocDAGNode(raidPtr);
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	unblockNode = rf_AllocDAGNode(raidPtr);
	unblockNode->list_next = dag_h->nodes;
	dag_h->nodes = unblockNode;

	termNode = rf_AllocDAGNode(raidPtr);
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

	/* this dag can commit immediately */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* initialize the commit, unblock, and term nodes */
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
		    0, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
		    0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
		    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
		    dag_h, "Trm", allocList);

	/* initialize the wnd nodes */
	/* each Wpd node writes one pda's worth of data to the primary copy;
	 * 1 succedent / 1 antecedent / 4 params, matching the asserts in the
	 * linking code below */
	if (nWndNodes > 0) {
		pda = asmap->physInfo;
		tmpwndNode = wndNode;
		for (i = 0; i < nWndNodes; i++) {
			rf_InitNode(tmpwndNode, rf_wait, RF_FALSE,
				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
				    rf_GenericWakeupFunc, 1, 1, 4, 0,
				    dag_h, "Wpd", allocList);
			RF_ASSERT(pda != NULL);
			tmpwndNode->params[0].p = pda;
			tmpwndNode->params[1].p = pda->bufPtr;
			tmpwndNode->params[2].v = parityStripeID;
			tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			pda = pda->next;
			tmpwndNode = tmpwndNode->list_next;
		}
		/* pda chain length must equal nWndNodes */
		RF_ASSERT(pda == NULL);
	}
	/* initialize the mirror nodes */
	/* Wsd targets the mirror (parityInfo) address pdaP but writes the
	 * same data buffer as the corresponding primary pda */
	if (nWmirNodes > 0) {
		pda = asmap->physInfo;
		pdaP = asmap->parityInfo;
		tmpwmirNode = wmirNode;
		for (i = 0; i < nWmirNodes; i++) {
			rf_InitNode(tmpwmirNode, rf_wait, RF_FALSE,
				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
				    rf_GenericWakeupFunc, 1, 1, 4, 0,
				    dag_h, "Wsd", allocList);
			RF_ASSERT(pda != NULL);
			tmpwmirNode->params[0].p = pdaP;
			tmpwmirNode->params[1].p = pda->bufPtr;
			tmpwmirNode->params[2].v = parityStripeID;
			tmpwmirNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			pda = pda->next;
			pdaP = pdaP->next;
			tmpwmirNode = tmpwmirNode->list_next;
		}
		/* both chains must be exactly nWmirNodes long */
		RF_ASSERT(pda == NULL);
		RF_ASSERT(pdaP == NULL);
	}
	/* link the header node to the commit node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 0);
	dag_h->succedents[0] = commitNode;

	/* link the commit node to the write nodes */
	/* Wpd nodes occupy succedent slots [0, nWndNodes), Wsd nodes follow */
	RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
	tmpwndNode = wndNode;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numAntecedents == 1);
		commitNode->succedents[i] = tmpwndNode;
		tmpwndNode->antecedents[0] = commitNode;
		tmpwndNode->antType[0] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}
	tmpwmirNode = wmirNode;
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(tmpwmirNode->numAntecedents == 1);
		commitNode->succedents[i + nWndNodes] = tmpwmirNode;
		tmpwmirNode->antecedents[0] = commitNode;
		tmpwmirNode->antType[0] = rf_control;
		tmpwmirNode = tmpwmirNode->list_next;
	}

	/* link the write nodes to the unblock node */
	/* same slot layout as above: Wpd first, then Wsd */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
	tmpwndNode = wndNode;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numSuccedents == 1);
		tmpwndNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i] = tmpwndNode;
		unblockNode->antType[i] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}
	tmpwmirNode = wmirNode;
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(tmpwmirNode->numSuccedents == 1);
		tmpwmirNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i + nWndNodes] = tmpwmirNode;
		unblockNode->antType[i + nWndNodes] = rf_control;
		tmpwmirNode = tmpwmirNode->list_next;
	}

	/* link the unblock node to the term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
	/* NOTE(review): bp and flags are unused in this routine; the write
	 * buffers come from pda->bufPtr. Presumably kept for interface
	 * uniformity with the other DAG creation routines — confirm. */
}