rf_pqdegdags.c revision 1.11
/*	$NetBSD: rf_pqdegdags.c,v 1.11 2005/12/11 12:23:37 christos Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.11 2005/12/11 12:23:37 christos Exp $");

#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    RF_PhysDiskAddr_t *qpda, void *bp);
/*
   Two data drives have failed, and we are doing a read that covers one of them.
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * Creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                      | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving data _not_ being read by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data
 * X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note:  The target buffer for the recovery node is set to the actual user buffer
 * where the failed data is supposed to end up.  This buffer is zeroed by the code
 * here.  Thus, if you create a degraded-read dag, use it, and then re-use it, you
 * have to be sure to zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what's needed and what's not.
 ****************************************************************************************/
/* init a disk node with two successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)
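
/*
 * Hypothetical usage of the helpers above (illustration only, not compiled):
 * each read node gets its control edges and its four standard disk-I/O
 * parameters set up as shown.  "rudNodes" and "pda" are made-up names; the
 * dag assembly for this path is actually delegated to rf_DoubleDegRead().
 */
#if 0
	INIT_DISK_NODE(&rudNodes[i], "Rud");
	DISK_NODE_PARAMS(rudNodes[i], pda);
#endif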
116
117RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
118{
119	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
120	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
121}
122
123static void
124applyPDA(raidPtr, pda, ppda, qpda, bp)
125	RF_Raid_t *raidPtr;
126	RF_PhysDiskAddr_t *pda;
127	RF_PhysDiskAddr_t *ppda;
128	RF_PhysDiskAddr_t *qpda;
129	void   *bp;
130{
131	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
132	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
133	RF_SectorCount_t s0len = ppda->numSector, len;
134	RF_SectorNum_t suoffset;
135	unsigned coeff;
136	char   *pbuf = ppda->bufPtr;
137	char   *qbuf = qpda->bufPtr;
138	char   *buf;
139	int     delta;
140
141	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
142	len = pda->numSector;
143	/* see if pda intersects a recovery pda */
144	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
145		buf = pda->bufPtr;
146		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
147		coeff = (coeff % raidPtr->Layout.numDataCol);
148
149		if (suoffset < s0off) {
150			delta = s0off - suoffset;
151			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
152			suoffset = s0off;
153			len -= delta;
154		}
155		if (suoffset > s0off) {
156			delta = suoffset - s0off;
157			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
158			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
159		}
160		if ((suoffset + len) > (s0len + s0off))
161			len = s0len + s0off - suoffset;
162
163		/* src, dest, len */
164		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
165
166		/* dest, src, len, coeff */
167		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
168	}
169}
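
/*
 * Illustrative sketch (not compiled, and not used by the dag code): the
 * per-byte effect of the rf_bxor()/rf_IncQ() pair above, assuming the usual
 * Reed-Solomon style construction over GF(2^8) in which P is the XOR of the
 * data columns and Q accumulates each column weighted by a per-column
 * coefficient.  The field polynomial and the coefficient encoding used by
 * the real tables in rf_pq.c may differ; gf256_mul() and pq_fold_sketch()
 * are hypothetical helpers, not RAIDframe APIs.
 */
#if 0
static unsigned char
gf256_mul(unsigned char a, unsigned char b)
{
	unsigned char r = 0;

	while (b != 0) {
		if (b & 1)
			r ^= a;
		b >>= 1;
		/* multiply a by x, reducing modulo x^8 + x^4 + x^3 + x^2 + 1 */
		a = (a & 0x80) ? (unsigned char)((a << 1) ^ 0x1d)
		               : (unsigned char)(a << 1);
	}
	return (r);
}

static void
pq_fold_sketch(unsigned char *p, unsigned char *q, const unsigned char *d,
    unsigned char c, unsigned int nbytes)
{
	unsigned int i;

	for (i = 0; i < nbytes; i++) {
		p[i] ^= d[i];			/* P += D (plain XOR)      */
		q[i] ^= gf256_mul(c, d[i]);	/* Q += c * D over GF(2^8) */
	}
}
#endif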
/*
   Recover data in the case of a double failure. There can be two
   result buffers, one for each chunk of data being recovered.
   The params are pda's that have not been range restricted or otherwise
   politely massaged - this should be done here. The last params are the
   pdas of P and Q, followed by the raidPtr and the asmap. The list can
   look like

   pda, pda, ... , p pda, q pda, raidptr, asm

   or

   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm

   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
*/
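
/*
 * For the common single-chunk case the indexing below works out to
 * (with np == node->numParams):
 *
 *   params[0 .. np-5]   surviving-data pdas
 *   params[np-4]        P pda
 *   params[np-3]        Q pda
 *   params[np-2]        raidPtr
 *   params[np-1]        asmap
 */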


int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr, suoffset;
	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
	int     two = 0;
	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
	char   *buf;
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		qpda = node->params[np - 4].p;
		qpda2 = node->params[np - 3].p;
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		buf = pda->bufPtr;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		len = pda->numSector;
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* see if pda intersects a recovery pda */
		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		/* if there are two recovery chunks, fold into the second one too */
		if (two)
			applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
	}

	/* ok, we got the parity back to the point where we can recover. We
	 * now need to determine the coefficients of the columns that need to
	 * be recovered. Also, only a single stripe unit ever needs to be
	 * recovered here. */

	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
						 * to recover. */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		/* need to determine the column of the other failed disk */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
			/* stop at the first dead data column other than the
			 * one we are already recovering into pda */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/* recover the data. Since we only want to recover one
		 * column, we overwrite the parity with the other one. */
		if (coeff < i)	/* recovering 'a' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
		else		/* recovering 'b' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
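
/*
 * Illustrative sketch (not compiled): the two-erasure solve that
 * rf_PQ_recover() performs above, written out for plain byte buffers.
 * After all surviving data has been folded in, P' = Da ^ Db and
 * Q' = ca*Da ^ cb*Db (arithmetic over GF(2^8)), so
 *
 *   Da = (Q' ^ cb*P') / (ca ^ cb)        Db = P' ^ Da
 *
 * pq_solve2_sketch() and gf256_inv() are hypothetical helpers; gf256_mul()
 * is the multiply from the sketch after applyPDA().  The real implementation
 * in rf_pq.c uses lookup tables and its own coefficient encoding.
 */
#if 0
static unsigned char
gf256_inv(unsigned char a)
{
	unsigned char r = 1;
	unsigned int e = 254;		/* a^254 == a^-1 for nonzero a */

	while (e != 0) {
		if (e & 1)
			r = gf256_mul(r, a);
		a = gf256_mul(a, a);
		e >>= 1;
	}
	return (r);
}

static void
pq_solve2_sketch(const unsigned char *pbuf, const unsigned char *qbuf,
    unsigned char *da, unsigned char *db,
    unsigned char ca, unsigned char cb, unsigned int nbytes)
{
	unsigned char k = gf256_inv(ca ^ cb);
	unsigned int i;

	for (i = 0; i < nbytes; i++) {
		unsigned char pp = pbuf[i], qq = qbuf[i];

		da[i] = gf256_mul(k, qq ^ gf256_mul(cb, pp));
		db[i] = pp ^ da[i];
	}
}
#endif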

int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and q. Use P
	 * & Q to recover the second failed data unit in P. Zero fill Q, then
	 * apply the recovered data to p. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitsAccessed - numDataFailed), failed pda,
	 * raidPtr, asmap */

	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* stop at the first dead data column other than the one
		 * containing the unit we are writing */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to recover we write over the
	 * parity. The column we don't care about we dump in q. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero-fill Q, then fold the recovered
	 * data back into it. */
	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is the only thing
	 * we are writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
/*
   Two lost data unit write case.

   There are really two cases here:

   (1) The write completely covers the two lost data units.
       In that case, a reconstruct write that doesn't write the
       failed data units will do the correct thing. So in this case,
       the dag looks like

            full stripe read of surviving data units (not being overwritten)
	    write new data (ignoring failed units)   compute P&Q
	                                             write P&Q


   (2) The write does not completely cover both failed data units
       (but touches at least one of them). Then we need to do the
       equivalent of a reconstruct read to recover the missing data
       unit from the other stripe.

       For any data we are writing that is not in the "shadow"
       of the failed units, we need to do a four-cycle update.
       PANIC on this case for now.

*/
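
/*
 * For example (hypothetical geometry): with 64-sector stripe units, a write
 * that fully covers both failed units gives sum == 128 == 2 * sectorsPerSU,
 * so rf_PQ_200_CreateWriteDAG() below takes the large-write path.  A write
 * where every accessed stripe unit is failed, or where the failed sectors
 * alone span at least one stripe unit, takes the simple small-write path;
 * anything else panics.
 */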

RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	int     sum;
	int     nf = asmap->numDataFailed;

	sum = asmap->failedPDAs[0]->numSector;
	if (nf == 2)
		sum += asmap->failedPDAs[1]->numSector;

	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
		/* large write case */
		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
		/* small write case: no user data outside the shadow */
		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	RF_PANIC();
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
#endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
				 * (RF_INCLUDE_RAID6 > 0) */

434