rf_dagdegwr.c revision 1.14
1/*	$NetBSD: rf_dagdegwr.c,v 1.14 2003/12/30 21:59:03 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * rf_dagdegwr.c
31 *
32 * code for creating degraded write DAGs
33 *
34 */
35
36#include <sys/cdefs.h>
37__KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.14 2003/12/30 21:59:03 oster Exp $");
38
39#include <dev/raidframe/raidframevar.h>
40
41#include "rf_raid.h"
42#include "rf_dag.h"
43#include "rf_dagutils.h"
44#include "rf_dagfuncs.h"
45#include "rf_debugMem.h"
46#include "rf_general.h"
47#include "rf_dagdegwr.h"
48
49
50/******************************************************************************
51 *
52 * General comments on DAG creation:
53 *
54 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
55 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
56 * is reached, the execution engine will halt forward execution and work
57 * backward through the graph, executing the undo functions.  Assuming that
58 * each node in the graph prior to the Cmt node is undoable and atomic - or -
59 * does not make changes to permanent state, the graph will fail atomically.
60 * If an error occurs after the Cmt node executes, the engine will roll-forward
61 * through the graph, blindly executing nodes until it reaches the end.
62 * If a graph reaches the end, it is assumed to have completed successfully.
63 *
64 * A graph has only 1 Cmt node.
65 *
66 */
67
68
69/******************************************************************************
70 *
71 * The following wrappers map the standard DAG creation interface to the
72 * DAG creation routines.  Additionally, these wrappers enable experimentation
73 * with new DAG structures by providing an extra level of indirection, allowing
74 * the DAG creation routines to be replaced at this single point.
75 */
76
77static
78RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
79{
80	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
81	    flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
82}
83
84void
85rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
86			  RF_DagHeader_t *dag_h, void *bp,
87			  RF_RaidAccessFlags_t flags,
88			  RF_AllocListElem_t *allocList)
89{
90
91	RF_ASSERT(asmap->numDataFailed == 1);
92	dag_h->creator = "DegradedWriteDAG";
93
94	/*
95	 * if the access writes only a portion of the failed unit, and also
96	 * writes some portion of at least one surviving unit, we create two
97	 * DAGs, one for the failed component and one for the non-failed
98	 * component, and do them sequentially.  Note that the fact that we're
99	 * accessing only a portion of the failed unit indicates that the
100	 * access either starts or ends in the failed unit, and hence we need
101	 * create only two dags.  This is inefficient in that the same data or
102	 * parity can get read and written twice using this structure.  I need
103	 * to fix this to do the access all at once.
104	 */
105	RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
106		    asmap->failedPDAs[0]->numSector !=
107			raidPtr->Layout.sectorsPerStripeUnit));
108	rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
109	    allocList);
110}
111
112
113
114/******************************************************************************
115 *
116 * DAG creation code begins here
117 */
118
119
120
121/******************************************************************************
122 *
123 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
124 * write, which is as follows
125 *
126 *                                        / {Wnq} --\
127 * hdr -> blockNode ->  Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
128 *                  \  {Rod} /            \  Wnd ---/
129 *                                        \ {Wnd} -/
130 *
131 * commit nodes: Xor, Wnd
132 *
133 * IMPORTANT:
134 * This DAG generator does not work for double-degraded archs since it does not
135 * generate Q
136 *
137 * This dag is essentially identical to the large-write dag, except that the
138 * write to the failed data unit is suppressed.
139 *
140 * IMPORTANT:  this dag does not work in the case where the access writes only
141 * a portion of the failed unit, and also writes some portion of at least one
142 * surviving SU.  this case is handled in CreateDegradedWriteDAG above.
143 *
144 * The block & unblock nodes are leftovers from a previous version.  They
145 * do nothing, but I haven't deleted them because it would be a tremendous
146 * effort to put them back in.
147 *
148 * This dag is used whenever one of the data units in a write has failed.
149 * If it is the parity unit that failed, the nonredundant write dag (below)
150 * is used.
151 *****************************************************************************/
152
153void
154rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
155				      RF_AccessStripeMap_t *asmap,
156				      RF_DagHeader_t *dag_h, void *bp,
157				      RF_RaidAccessFlags_t flags,
158				      RF_AllocListElem_t *allocList,
159				      int nfaults,
160				      int (*redFunc) (RF_DagNode_t *),
161				      int allowBufferRecycle)
162{
163	int     nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
164	        rdnodesFaked;
165	RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
166	RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
167	RF_SectorCount_t sectorsPerSU;
168	RF_ReconUnitNum_t which_ru;
169	char   *xorTargetBuf = NULL;	/* the target buffer for the XOR
170					 * operation */
171	char   *overlappingPDAs;/* a temporary array of flags */
172	RF_AccessStripeMapHeader_t *new_asm_h[2];
173	RF_PhysDiskAddr_t *pda, *parityPDA;
174	RF_StripeNum_t parityStripeID;
175	RF_PhysDiskAddr_t *failedPDA;
176	RF_RaidLayout_t *layoutPtr;
177
178	layoutPtr = &(raidPtr->Layout);
179	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
180	    &which_ru);
181	sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
182	/* failedPDA points to the pda within the asm that targets the failed
183	 * disk */
184	failedPDA = asmap->failedPDAs[0];
185
186	if (rf_dagDebug)
187		printf("[Creating degraded-write DAG]\n");
188
189	RF_ASSERT(asmap->numDataFailed == 1);
190	dag_h->creator = "SimpleDegradedWriteDAG";
191
192	/*
193         * Generate two ASMs identifying the surviving data
194         * we need in order to recover the lost data.
195         */
196	/* overlappingPDAs array must be zero'd */
197	RF_Malloc(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char), (char *));
198	rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
199	    &nXorBufs, NULL, overlappingPDAs, allocList);
200
201	/* create all the nodes at once */
202	nWndNodes = asmap->numStripeUnitsAccessed - 1;	/* no access is
203							 * generated for the
204							 * failed pda */
205
206	nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
207	    ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
208	/*
209         * XXX
210         *
211         * There's a bug with a complete stripe overwrite- that means 0 reads
212         * of old data, and the rest of the DAG generation code doesn't like
213         * that. A release is coming, and I don't wanna risk breaking a critical
214         * DAG generator, so here's what I'm gonna do- if there's no read nodes,
215         * I'm gonna fake there being a read node, and I'm gonna swap in a
216         * no-op node in its place (to make all the link-up code happy).
217         * This should be fixed at some point.  --jimz
218         */
219	if (nRrdNodes == 0) {
220		nRrdNodes = 1;
221		rdnodesFaked = 1;
222	} else {
223		rdnodesFaked = 0;
224	}
225	/* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
226	nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
227	RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t),
228	    (RF_DagNode_t *), allocList);
229	i = 0;
230	blockNode = &nodes[i];
231	i += 1;
232	commitNode = &nodes[i];
233	i += 1;
234	unblockNode = &nodes[i];
235	i += 1;
236	termNode = &nodes[i];
237	i += 1;
238	xorNode = &nodes[i];
239	i += 1;
240	wnpNode = &nodes[i];
241	i += 1;
242	wndNodes = &nodes[i];
243	i += nWndNodes;
244	rrdNodes = &nodes[i];
245	i += nRrdNodes;
246	if (nfaults == 2) {
247		wnqNode = &nodes[i];
248		i += 1;
249	} else {
250		wnqNode = NULL;
251	}
252	RF_ASSERT(i == nNodes);
253
254	/* this dag can not commit until all rrd and xor Nodes have completed */
255	dag_h->numCommitNodes = 1;
256	dag_h->numCommits = 0;
257	dag_h->numSuccedents = 1;
258
259	RF_ASSERT(nRrdNodes > 0);
260	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
261	    NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
262	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
263	    NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
264	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
265	    NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
266	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
267	    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
268	rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
269	    nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
270
271	/*
272         * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
273         * the failed buffer, save a pointer to it so we can use it as the target
274         * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
275         * a buffer is the same size as the failed buffer, it must also be at the
276         * same alignment within the SU.
277         */
278	i = 0;
279	if (new_asm_h[0]) {
280		for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
281		    i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
282		    i++, pda = pda->next) {
283			rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
284			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
285			RF_ASSERT(pda);
286			rrdNodes[i].params[0].p = pda;
287			rrdNodes[i].params[1].p = pda->bufPtr;
288			rrdNodes[i].params[2].v = parityStripeID;
289			rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
290		}
291	}
292	/* i now equals the number of stripe units accessed in new_asm_h[0] */
293	if (new_asm_h[1]) {
294		for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
295		    j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
296		    j++, pda = pda->next) {
297			rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
298			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
299			RF_ASSERT(pda);
300			rrdNodes[i + j].params[0].p = pda;
301			rrdNodes[i + j].params[1].p = pda->bufPtr;
302			rrdNodes[i + j].params[2].v = parityStripeID;
303			rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
304			if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
305				xorTargetBuf = pda->bufPtr;
306		}
307	}
308	if (rdnodesFaked) {
309		/*
310	         * This is where we'll init that fake noop read node
311	         * (XXX should the wakeup func be different?)
312	         */
313		rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
314		    NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
315	}
316	/*
317         * Make a PDA for the parity unit.  The parity PDA should start at
318         * the same offset into the SU as the failed PDA.
319         */
320	/* Danner comment: I don't think this copy is really necessary. We are
321	 * in one of two cases here. (1) The entire failed unit is written.
322	 * Then asmap->parityInfo will describe the entire parity. (2) We are
323	 * only writing a subset of the failed unit and nothing else. Then the
324	 * asmap->parityInfo describes the failed unit and the copy can also
325	 * be avoided. */
326
327	RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
328	parityPDA->col = asmap->parityInfo->col;
329	parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
330	    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
331	parityPDA->numSector = failedPDA->numSector;
332
333	if (!xorTargetBuf) {
334		RF_MallocAndAdd(xorTargetBuf,
335		    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
336	}
337	/* init the Wnp node */
338	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
339	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
340	wnpNode->params[0].p = parityPDA;
341	wnpNode->params[1].p = xorTargetBuf;
342	wnpNode->params[2].v = parityStripeID;
343	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
344
345	/* fill in the Wnq Node */
346	if (nfaults == 2) {
347		{
348			RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
349			    (RF_PhysDiskAddr_t *), allocList);
350			parityPDA->col = asmap->qInfo->col;
351			parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
352			    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
353			parityPDA->numSector = failedPDA->numSector;
354
355			rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
356			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
357			wnqNode->params[0].p = parityPDA;
358			RF_MallocAndAdd(xorNode->results[1],
359			    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
360			wnqNode->params[1].p = xorNode->results[1];
361			wnqNode->params[2].v = parityStripeID;
362			wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
363		}
364	}
365	/* fill in the Wnd nodes */
366	for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
367		if (pda == failedPDA) {
368			i--;
369			continue;
370		}
371		rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
372		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
373		RF_ASSERT(pda);
374		wndNodes[i].params[0].p = pda;
375		wndNodes[i].params[1].p = pda->bufPtr;
376		wndNodes[i].params[2].v = parityStripeID;
377		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
378	}
379
380	/* fill in the results of the xor node */
381	xorNode->results[0] = xorTargetBuf;
382
383	/* fill in the params of the xor node */
384
385	paramNum = 0;
386	if (rdnodesFaked == 0) {
387		for (i = 0; i < nRrdNodes; i++) {
388			/* all the Rrd nodes need to be xored together */
389			xorNode->params[paramNum++] = rrdNodes[i].params[0];
390			xorNode->params[paramNum++] = rrdNodes[i].params[1];
391		}
392	}
393	for (i = 0; i < nWndNodes; i++) {
394		/* any Wnd nodes that overlap the failed access need to be
395		 * xored in */
396		if (overlappingPDAs[i]) {
397			RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
398			memcpy((char *) pda, (char *) wndNodes[i].params[0].p, sizeof(RF_PhysDiskAddr_t));
399			rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
400			xorNode->params[paramNum++].p = pda;
401			xorNode->params[paramNum++].p = pda->bufPtr;
402		}
403	}
404	RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
405
406	/*
407         * Install the failed PDA into the xor param list so that the
408         * new data gets xor'd in.
409         */
410	xorNode->params[paramNum++].p = failedPDA;
411	xorNode->params[paramNum++].p = failedPDA->bufPtr;
412
413	/*
414         * The last 2 params to the recovery xor node are always the failed
415         * PDA and the raidPtr. install the failedPDA even though we have just
416         * done so above. This allows us to use the same XOR function for both
417         * degraded reads and degraded writes.
418         */
419	xorNode->params[paramNum++].p = failedPDA;
420	xorNode->params[paramNum++].p = raidPtr;
421	RF_ASSERT(paramNum == 2 * nXorBufs + 2);
422
423	/*
424         * Code to link nodes begins here
425         */
426
427	/* link header to block node */
428	RF_ASSERT(blockNode->numAntecedents == 0);
429	dag_h->succedents[0] = blockNode;
430
431	/* link block node to rd nodes */
432	RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
433	for (i = 0; i < nRrdNodes; i++) {
434		RF_ASSERT(rrdNodes[i].numAntecedents == 1);
435		blockNode->succedents[i] = &rrdNodes[i];
436		rrdNodes[i].antecedents[0] = blockNode;
437		rrdNodes[i].antType[0] = rf_control;
438	}
439
440	/* link read nodes to xor node */
441	RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
442	for (i = 0; i < nRrdNodes; i++) {
443		RF_ASSERT(rrdNodes[i].numSuccedents == 1);
444		rrdNodes[i].succedents[0] = xorNode;
445		xorNode->antecedents[i] = &rrdNodes[i];
446		xorNode->antType[i] = rf_trueData;
447	}
448
449	/* link xor node to commit node */
450	RF_ASSERT(xorNode->numSuccedents == 1);
451	RF_ASSERT(commitNode->numAntecedents == 1);
452	xorNode->succedents[0] = commitNode;
453	commitNode->antecedents[0] = xorNode;
454	commitNode->antType[0] = rf_control;
455
456	/* link commit node to wnd nodes */
457	RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
458	for (i = 0; i < nWndNodes; i++) {
459		RF_ASSERT(wndNodes[i].numAntecedents == 1);
460		commitNode->succedents[i] = &wndNodes[i];
461		wndNodes[i].antecedents[0] = commitNode;
462		wndNodes[i].antType[0] = rf_control;
463	}
464
465	/* link the commit node to wnp, wnq nodes */
466	RF_ASSERT(wnpNode->numAntecedents == 1);
467	commitNode->succedents[nWndNodes] = wnpNode;
468	wnpNode->antecedents[0] = commitNode;
469	wnpNode->antType[0] = rf_control;
470	if (nfaults == 2) {
471		RF_ASSERT(wnqNode->numAntecedents == 1);
472		commitNode->succedents[nWndNodes + 1] = wnqNode;
473		wnqNode->antecedents[0] = commitNode;
474		wnqNode->antType[0] = rf_control;
475	}
476	/* link write new data nodes to unblock node */
477	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
478	for (i = 0; i < nWndNodes; i++) {
479		RF_ASSERT(wndNodes[i].numSuccedents == 1);
480		wndNodes[i].succedents[0] = unblockNode;
481		unblockNode->antecedents[i] = &wndNodes[i];
482		unblockNode->antType[i] = rf_control;
483	}
484
485	/* link write new parity node to unblock node */
486	RF_ASSERT(wnpNode->numSuccedents == 1);
487	wnpNode->succedents[0] = unblockNode;
488	unblockNode->antecedents[nWndNodes] = wnpNode;
489	unblockNode->antType[nWndNodes] = rf_control;
490
491	/* link write new q node to unblock node */
492	if (nfaults == 2) {
493		RF_ASSERT(wnqNode->numSuccedents == 1);
494		wnqNode->succedents[0] = unblockNode;
495		unblockNode->antecedents[nWndNodes + 1] = wnqNode;
496		unblockNode->antType[nWndNodes + 1] = rf_control;
497	}
498	/* link unblock node to term node */
499	RF_ASSERT(unblockNode->numSuccedents == 1);
500	RF_ASSERT(termNode->numAntecedents == 1);
501	RF_ASSERT(termNode->numSuccedents == 0);
502	unblockNode->succedents[0] = termNode;
503	termNode->antecedents[0] = unblockNode;
504	termNode->antType[0] = rf_control;
505}
/*
 * CONS_PDA(info, start, num) -- fill in the PDA that the local `pda_p'
 * points at so that it describes `num' sectors beginning `start' sectors
 * into the stripe unit containing asmap->info (info is the name of a PDA
 * pointer member of the ASM, e.g. parityInfo or qInfo).  The column is
 * copied from asmap->info, the `next' link is cleared and a data buffer of
 * rf_RaidAddressToByte(raidPtr, num) bytes is allocated on allocList.
 *
 * Relies on these names being in scope at the point of use:
 * pda_p, asmap, secPerSU, raidPtr, allocList.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement
 * (safe under an unbraced if/else); `start' and `num' are parenthesized so
 * that arbitrary expressions may be passed.  `num' is evaluated twice, so
 * it must not have side effects.
 */
#define CONS_PDA(info,start,num) \
  do { \
    pda_p->col = asmap->info->col; \
    pda_p->startSector = ((asmap->info->startSector / secPerSU) * secPerSU) + (start); \
    pda_p->numSector = (num); \
    pda_p->next = NULL; \
    RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,(num)),(char *), allocList); \
  } while (0)
512#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
513void
514rf_WriteGenerateFailedAccessASMs(
515    RF_Raid_t * raidPtr,
516    RF_AccessStripeMap_t * asmap,
517    RF_PhysDiskAddr_t ** pdap,
518    int *nNodep,
519    RF_PhysDiskAddr_t ** pqpdap,
520    int *nPQNodep,
521    RF_AllocListElem_t * allocList)
522{
523	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
524	int     PDAPerDisk, i;
525	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
526	int     numDataCol = layoutPtr->numDataCol;
527	int     state;
528	unsigned napdas;
529	RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
530	RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
531	RF_PhysDiskAddr_t *pda_p;
532	RF_RaidAddr_t sosAddr;
533
534	/* determine how many pda's we will have to generate per unaccess
535	 * stripe. If there is only one failed data unit, it is one; if two,
536	 * possibly two, depending wether they overlap. */
537
538	fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
539	fone_end = fone_start + fone->numSector;
540
541	if (asmap->numDataFailed == 1) {
542		PDAPerDisk = 1;
543		state = 1;
544		RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
545		pda_p = *pqpdap;
546		/* build p */
547		CONS_PDA(parityInfo, fone_start, fone->numSector);
548		pda_p->type = RF_PDA_TYPE_PARITY;
549		pda_p++;
550		/* build q */
551		CONS_PDA(qInfo, fone_start, fone->numSector);
552		pda_p->type = RF_PDA_TYPE_Q;
553	} else {
554		ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
555		ftwo_end = ftwo_start + ftwo->numSector;
556		if (fone->numSector + ftwo->numSector > secPerSU) {
557			PDAPerDisk = 1;
558			state = 2;
559			RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
560			pda_p = *pqpdap;
561			CONS_PDA(parityInfo, 0, secPerSU);
562			pda_p->type = RF_PDA_TYPE_PARITY;
563			pda_p++;
564			CONS_PDA(qInfo, 0, secPerSU);
565			pda_p->type = RF_PDA_TYPE_Q;
566		} else {
567			PDAPerDisk = 2;
568			state = 3;
569			/* four of them, fone, then ftwo */
570			RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
571			pda_p = *pqpdap;
572			CONS_PDA(parityInfo, fone_start, fone->numSector);
573			pda_p->type = RF_PDA_TYPE_PARITY;
574			pda_p++;
575			CONS_PDA(qInfo, fone_start, fone->numSector);
576			pda_p->type = RF_PDA_TYPE_Q;
577			pda_p++;
578			CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
579			pda_p->type = RF_PDA_TYPE_PARITY;
580			pda_p++;
581			CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
582			pda_p->type = RF_PDA_TYPE_Q;
583		}
584	}
585	/* figure out number of nonaccessed pda */
586	napdas = PDAPerDisk * (numDataCol - 2);
587	*nPQNodep = PDAPerDisk;
588
589	*nNodep = napdas;
590	if (napdas == 0)
591		return;		/* short circuit */
592
593	/* allocate up our list of pda's */
594
595	RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t),
596			(RF_PhysDiskAddr_t *), allocList);
597	*pdap = pda_p;
598
599	/* linkem together */
600	for (i = 0; i < (napdas - 1); i++)
601		pda_p[i].next = pda_p + (i + 1);
602
603	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
604	for (i = 0; i < numDataCol; i++) {
605		if ((pda_p - (*pdap)) == napdas)
606			continue;
607		pda_p->type = RF_PDA_TYPE_DATA;
608		pda_p->raidAddress = sosAddr + (i * secPerSU);
609		(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
610		/* skip over dead disks */
611		if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
612			continue;
613		switch (state) {
614		case 1:	/* fone */
615			pda_p->numSector = fone->numSector;
616			pda_p->raidAddress += fone_start;
617			pda_p->startSector += fone_start;
618			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
619			break;
620		case 2:	/* full stripe */
621			pda_p->numSector = secPerSU;
622			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
623			break;
624		case 3:	/* two slabs */
625			pda_p->numSector = fone->numSector;
626			pda_p->raidAddress += fone_start;
627			pda_p->startSector += fone_start;
628			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
629			pda_p++;
630			pda_p->type = RF_PDA_TYPE_DATA;
631			pda_p->raidAddress = sosAddr + (i * secPerSU);
632			(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
633			pda_p->numSector = ftwo->numSector;
634			pda_p->raidAddress += ftwo_start;
635			pda_p->startSector += ftwo_start;
636			RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
637			break;
638		default:
639			RF_PANIC();
640		}
641		pda_p++;
642	}
643
644	RF_ASSERT(pda_p - *pdap == napdas);
645	return;
646}
/* The PDA a disk read/write node operates on (its first parameter). */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)

/*
 * Fill in the four parameters of the disk read/write node `node_' (a node
 * object, not a pointer) for the PDA `pda_': the PDA, its buffer, the parity
 * stripe ID and the packed priority/recon-unit word.
 *
 * Relies on `parityStripeID' and `which_ru' being in scope at the point of
 * use.  Wrapped in do { } while (0) so it acts as a single statement, with
 * every use of the arguments parenthesized.
 */
#define DISK_NODE_PARAMS(node_,pda_) \
  do { \
    (node_).params[0].p = (pda_); \
    (node_).params[1].p = (pda_)->bufPtr; \
    (node_).params[2].v = parityStripeID; \
    (node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
  } while (0)
654
655void
656rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
657		       RF_DagHeader_t *dag_h, void *bp,
658		       RF_RaidAccessFlags_t flags,
659		       RF_AllocListElem_t *allocList,
660		       char *redundantReadNodeName,
661		       char *redundantWriteNodeName,
662		       char *recoveryNodeName,
663		       int (*recovFunc) (RF_DagNode_t *))
664{
665	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
666	RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
667	       *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
668	RF_PhysDiskAddr_t *pda, *pqPDAs;
669	RF_PhysDiskAddr_t *npdas;
670	int     nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
671	RF_ReconUnitNum_t which_ru;
672	int     nPQNodes;
673	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
674
675	/* simple small write case - First part looks like a reconstruct-read
676	 * of the failed data units. Then a write of all data units not
677	 * failed. */
678
679
680	/* Hdr | ------Block- /  /         \   Rrd  Rrd ...  Rrd  Rp Rq \  \
681	 * /  -------PQ----- /   \   \ Wud   Wp  WQ	     \    |   /
682	 * --Unblock- | T
683	 *
684	 * Rrd = read recovery data  (potentially none) Wud = write user data
685	 * (not incl. failed disks) Wp = Write P (could be two) Wq = Write Q
686	 * (could be two)
687	 *
688	 */
689
690	rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
691
692	RF_ASSERT(asmap->numDataFailed == 1);
693
694	nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
695	nReadNodes = nRrdNodes + 2 * nPQNodes;
696	nWriteNodes = nWudNodes + 2 * nPQNodes;
697	nNodes = 4 + nReadNodes + nWriteNodes;
698
699	RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
700	blockNode = nodes;
701	unblockNode = blockNode + 1;
702	termNode = unblockNode + 1;
703	recoveryNode = termNode + 1;
704	rrdNodes = recoveryNode + 1;
705	rpNodes = rrdNodes + nRrdNodes;
706	rqNodes = rpNodes + nPQNodes;
707	wudNodes = rqNodes + nPQNodes;
708	wpNodes = wudNodes + nWudNodes;
709	wqNodes = wpNodes + nPQNodes;
710
711	dag_h->creator = "PQ_DDSimpleSmallWrite";
712	dag_h->numSuccedents = 1;
713	dag_h->succedents[0] = blockNode;
714	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
715	termNode->antecedents[0] = unblockNode;
716	termNode->antType[0] = rf_control;
717
718	/* init the block and unblock nodes */
719	/* The block node has all the read nodes as successors */
720	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
721	for (i = 0; i < nReadNodes; i++)
722		blockNode->succedents[i] = rrdNodes + i;
723
724	/* The unblock node has all the writes as successors */
725	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
726	for (i = 0; i < nWriteNodes; i++) {
727		unblockNode->antecedents[i] = wudNodes + i;
728		unblockNode->antType[i] = rf_control;
729	}
730	unblockNode->succedents[0] = termNode;
731
732#define INIT_READ_NODE(node,name) \
733  rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
734  (node)->succedents[0] = recoveryNode; \
735  (node)->antecedents[0] = blockNode; \
736  (node)->antType[0] = rf_control;
737
738	/* build the read nodes */
739	pda = npdas;
740	for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
741		INIT_READ_NODE(rrdNodes + i, "rrd");
742		DISK_NODE_PARAMS(rrdNodes[i], pda);
743	}
744
745	/* read redundancy pdas */
746	pda = pqPDAs;
747	INIT_READ_NODE(rpNodes, "Rp");
748	RF_ASSERT(pda);
749	DISK_NODE_PARAMS(rpNodes[0], pda);
750	pda++;
751	INIT_READ_NODE(rqNodes, redundantReadNodeName);
752	RF_ASSERT(pda);
753	DISK_NODE_PARAMS(rqNodes[0], pda);
754	if (nPQNodes == 2) {
755		pda++;
756		INIT_READ_NODE(rpNodes + 1, "Rp");
757		RF_ASSERT(pda);
758		DISK_NODE_PARAMS(rpNodes[1], pda);
759		pda++;
760		INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
761		RF_ASSERT(pda);
762		DISK_NODE_PARAMS(rqNodes[1], pda);
763	}
764	/* the recovery node has all reads as precedessors and all writes as
765	 * successors. It generates a result for every write P or write Q
766	 * node. As parameters, it takes a pda per read and a pda per stripe
767	 * of user data written. It also takes as the last params the raidPtr
768	 * and asm. For results, it takes PDA for P & Q. */
769
770
771	rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
772	    nWriteNodes,	/* succesors */
773	    nReadNodes,		/* preds */
774	    nReadNodes + nWudNodes + 3,	/* params */
775	    2 * nPQNodes,	/* results */
776	    dag_h, recoveryNodeName, allocList);
777
778
779
780	for (i = 0; i < nReadNodes; i++) {
781		recoveryNode->antecedents[i] = rrdNodes + i;
782		recoveryNode->antType[i] = rf_control;
783		recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
784	}
785	for (i = 0; i < nWudNodes; i++) {
786		recoveryNode->succedents[i] = wudNodes + i;
787	}
788	recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
789	recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
790	recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
791
792	for (; i < nWriteNodes; i++)
793		recoveryNode->succedents[i] = wudNodes + i;
794
795	pda = pqPDAs;
796	recoveryNode->results[0] = pda;
797	pda++;
798	recoveryNode->results[1] = pda;
799	if (nPQNodes == 2) {
800		pda++;
801		recoveryNode->results[2] = pda;
802		pda++;
803		recoveryNode->results[3] = pda;
804	}
805	/* fill writes */
806#define INIT_WRITE_NODE(node,name) \
807  rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
808    (node)->succedents[0] = unblockNode; \
809    (node)->antecedents[0] = recoveryNode; \
810    (node)->antType[0] = rf_control;
811
812	pda = asmap->physInfo;
813	for (i = 0; i < nWudNodes; i++) {
814		INIT_WRITE_NODE(wudNodes + i, "Wd");
815		DISK_NODE_PARAMS(wudNodes[i], pda);
816		recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
817		pda = pda->next;
818	}
819	/* write redundancy pdas */
820	pda = pqPDAs;
821	INIT_WRITE_NODE(wpNodes, "Wp");
822	RF_ASSERT(pda);
823	DISK_NODE_PARAMS(wpNodes[0], pda);
824	pda++;
825	INIT_WRITE_NODE(wqNodes, "Wq");
826	RF_ASSERT(pda);
827	DISK_NODE_PARAMS(wqNodes[0], pda);
828	if (nPQNodes == 2) {
829		pda++;
830		INIT_WRITE_NODE(wpNodes + 1, "Wp");
831		RF_ASSERT(pda);
832		DISK_NODE_PARAMS(wpNodes[1], pda);
833		pda++;
834		INIT_WRITE_NODE(wqNodes + 1, "Wq");
835		RF_ASSERT(pda);
836		DISK_NODE_PARAMS(wqNodes[1], pda);
837	}
838}
839#endif   /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */
840