rf_evenodd_dagfuncs.c revision 1.23
1/*	$NetBSD: rf_evenodd_dagfuncs.c,v 1.23 2019/02/09 03:34:00 christos Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: ChangMing Wu
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * Code for RAID-EVENODD  architecture.
31 */
32
33#include <sys/cdefs.h>
34__KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.23 2019/02/09 03:34:00 christos Exp $");
35
36#include "rf_archs.h"
37
38#ifdef _KERNEL_OPT
39#include "opt_raid_diagnostic.h"
40#endif
41
42#if RF_INCLUDE_EVENODD > 0
43
44#include <dev/raidframe/raidframevar.h>
45
46#include "rf_raid.h"
47#include "rf_dag.h"
48#include "rf_dagffrd.h"
49#include "rf_dagffwr.h"
50#include "rf_dagdegrd.h"
51#include "rf_dagdegwr.h"
52#include "rf_dagutils.h"
53#include "rf_dagfuncs.h"
54#include "rf_etimer.h"
55#include "rf_general.h"
56#include "rf_parityscan.h"
57#include "rf_evenodd.h"
58#include "rf_evenodd_dagfuncs.h"
59
60/* These redundant functions are for small write */
61RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"};
62RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"};
63/* These redundant functions are for degraded read */
64RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
65RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"};
/**********************************************************************************************
 *   The following encoding node function is used in EO_000_CreateLargeWriteDAG
 **********************************************************************************************/
69int
70rf_RegularPEFunc(RF_DagNode_t *node)
71{
72	rf_RegularESubroutine(node, node->results[1]);
73	rf_RegularXorFunc(node);/* does the wakeup here! */
74#if 1
75	return (0);		/* XXX This was missing... GO */
76#endif
77}
78
79
/************************************************************************************************
 *  For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and (ii) SimpleONEFunc() to
 *  be used. The former is used when the write accesses at least a full stripe unit of sectors.
 *  The latter is used when the write accesses two stripe units but with total sectors
 *  fewer than the sectors per SU. In this case, the accesses to parity and 'E' appear as disconnected
 *  areas in their stripe units, and the parity write and the 'E' write are each divided into two
 *  distinct writes (four in total). The simple old-new write and regular old-new write happen as in RAID-5.
 ************************************************************************************************/
88
/* Algorithm:
     1. Store the difference of old data and new data in the Rod buffer.
     2. Then encode this buffer into the buffer which already has the old 'E' information in it;
	the result can be shown to be the new 'E' information.
     3. Xor the Wnd buffer into the difference buffer to recover the original old data.
   There is an alternative: allocate a temporary buffer for storing the difference of
   old data and new data, then encode the temp buffer into the old 'E' buffer to form the new 'E'.
   That approach runs at the same speed as the one above but needs more memory.
*/
98int
99rf_RegularONEFunc(RF_DagNode_t *node)
100{
101	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
102	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
103	int     EpdaIndex = (node->numParams - 1) / 2 - 1;	/* the parameter of node
104								 * where you can find
105								 * e-pda */
106	int     i, k;
107	int     suoffset, length;
108	RF_RowCol_t scol;
109	char   *srcbuf, *destbuf;
110	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
111	RF_Etimer_t timer;
112	RF_PhysDiskAddr_t *pda;
113#ifdef RAID_DIAGNOSTIC
114	RF_PhysDiskAddr_t *EPDA =
115	    (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
116	int     ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);
117
118	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
119	RF_ASSERT(ESUOffset == 0);
120#endif /* RAID_DIAGNOSTIC */
121
122	RF_ETIMER_START(timer);
123
124	/* Xor the Wnd buffer into Rod buffer, the difference of old data and
125	 * new data is stored in Rod buffer */
126	for (k = 0; k < EpdaIndex; k += 2) {
127		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
128		rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
129	}
130	/* Start to encoding the buffer storing the difference of old data and
131	 * new data into 'E' buffer  */
132	for (i = 0; i < EpdaIndex; i += 2)
133		if (node->params[i + 1].p != node->results[0]) {	/* results[0] is buf ptr
134									 * of E */
135			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
136			srcbuf = (char *) node->params[i + 1].p;
137			scol = rf_EUCol(layoutPtr, pda->raidAddress);
138			suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
139			destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
140			rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
141		}
142	/* Recover the original old data to be used by parity encoding
143	 * function in XorNode */
144	for (k = 0; k < EpdaIndex; k += 2) {
145		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
146		rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
147	}
148	RF_ETIMER_STOP(timer);
149	RF_ETIMER_EVAL(timer);
150	tracerec->q_us += RF_ETIMER_VAL_US(timer);
151	rf_GenericWakeupFunc(node, 0);
152#if 1
153	return (0);		/* XXX this was missing.. GO */
154#endif
155}
156
157int
158rf_SimpleONEFunc(RF_DagNode_t *node)
159{
160	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
161	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
162	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
163	int     retcode = 0;
164	char   *srcbuf, *destbuf;
165	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
166	int     length;
167	RF_RowCol_t scol;
168	RF_Etimer_t timer;
169
170	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
171	if (node->dagHdr->status == rf_enable) {
172		RF_ETIMER_START(timer);
173		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);	/* this is a pda of
174														 * writeDataNodes */
175		/* bxor to buffer of readDataNodes */
176		retcode = rf_bxor(node->params[5].p, node->params[1].p, length);
177		/* find out the corresponding colume in encoding matrix for
178		 * write colume to be encoded into redundant disk 'E' */
179		scol = rf_EUCol(layoutPtr, pda->raidAddress);
180		srcbuf = node->params[1].p;
181		destbuf = node->params[3].p;
182		/* Start encoding process */
183		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
184		rf_bxor(node->params[5].p, node->params[1].p, length);
185		RF_ETIMER_STOP(timer);
186		RF_ETIMER_EVAL(timer);
187		tracerec->q_us += RF_ETIMER_VAL_US(timer);
188
189	}
190	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
191							 * explicitly since no
192							 * I/O in this node */
193}
194
195
196/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write  ********/
197void
198rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf)
199{
200	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
201	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
202	RF_PhysDiskAddr_t *pda;
203	int     i, suoffset;
204	RF_RowCol_t scol;
205	char   *srcbuf, *destbuf;
206	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
207	RF_Etimer_t timer;
208
209	RF_ETIMER_START(timer);
210	for (i = 0; i < node->numParams - 2; i += 2) {
211		RF_ASSERT(node->params[i + 1].p != ebuf);
212		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
213		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
214		scol = rf_EUCol(layoutPtr, pda->raidAddress);
215		srcbuf = (char *) node->params[i + 1].p;
216		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset);
217		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
218	}
219	RF_ETIMER_STOP(timer);
220	RF_ETIMER_EVAL(timer);
221	tracerec->xor_us += RF_ETIMER_VAL_US(timer);
222}
223
224
225/*******************************************************************************************
226 *			 Used in  EO_001_CreateLargeWriteDAG
227 ******************************************************************************************/
228int
229rf_RegularEFunc(RF_DagNode_t *node)
230{
231	rf_RegularESubroutine(node, node->results[0]);
232	rf_GenericWakeupFunc(node, 0);
233#if 1
234	return (0);		/* XXX this was missing?.. GO */
235#endif
236}
/*******************************************************************************************
 * This degraded function allows only two cases:
 *  1. when the write accesses the full failed stripe unit, the access can cover more than
 *     one stripe unit.
 *  2. when the write accesses only part of the failed SU, we assume accesses of more than
 *     one stripe unit are not allowed, so that the write can be dealt with like a
 *     large write.
 *  The following function is based on these assumptions. So except in the second case,
 *  it looks the same as a large write encoding function. But this is not exactly the
 *  normal way of doing a degraded write, since RAIDframe has to break accesses
 *  other than the above two cases into smaller accesses. We may have to change
 *  DegrESubroutine in the future.
 *******************************************************************************************/
250void
251rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
252{
253	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
254	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
255	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
256	RF_PhysDiskAddr_t *pda;
257	int     i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
258	RF_RowCol_t scol;
259	char   *srcbuf, *destbuf;
260	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
261	RF_Etimer_t timer;
262
263	RF_ETIMER_START(timer);
264	for (i = 0; i < node->numParams - 2; i += 2) {
265		RF_ASSERT(node->params[i + 1].p != ebuf);
266		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
267		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
268		scol = rf_EUCol(layoutPtr, pda->raidAddress);
269		srcbuf = (char *) node->params[i + 1].p;
270		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
271		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
272	}
273
274	RF_ETIMER_STOP(timer);
275	RF_ETIMER_EVAL(timer);
276	tracerec->q_us += RF_ETIMER_VAL_US(timer);
277}
278
279
280/**************************************************************************************
281 * This function is used in case where one data disk failed and both redundant disks
282 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk
283 * failed in the stripe but not accessed at this time, then we should, instead, use
284 * the rf_EOWriteDoubleRecoveryFunc().
285 **************************************************************************************/
286int
287rf_Degraded_100_EOFunc(RF_DagNode_t *node)
288{
289	rf_DegrESubroutine(node, node->results[1]);
290	rf_RecoveryXorFunc(node);	/* does the wakeup here! */
291#if 1
292	return (0);		/* XXX this was missing... SHould these be
293				 * void functions??? GO */
294#endif
295}
296/**************************************************************************************
297 * This function is to encode one sector in one of the data disks to the E disk.
298 * However, in evenodd this function can also be used as decoding function to recover
299 * data from dead disk in the case of parity failure and a single data failure.
300 **************************************************************************************/
301void
302rf_e_EncOneSect(
303    RF_RowCol_t srcLogicCol,
304    char *srcSecbuf,
305    RF_RowCol_t destLogicCol,
306    char *destSecbuf,
307    int bytesPerSector)
308{
309	int     S_index;	/* index of the EU in the src col which need
310				 * be Xored into all EUs in a dest sector */
311	int     numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
312	RF_RowCol_t j, indexInDest,	/* row index of an encoding unit in
313					 * the destination colume of encoding
314					 * matrix */
315	        indexInSrc;	/* row index of an encoding unit in the source
316				 * colume used for recovery */
317	int     bytesPerEU = bytesPerSector / numRowInEncMatix;
318
319#if RF_EO_MATRIX_DIM > 17
320	int     shortsPerEU = bytesPerEU / sizeof(short);
321	short  *destShortBuf, *srcShortBuf1, *srcShortBuf2;
322	short temp1;
323#elif RF_EO_MATRIX_DIM == 17
324	int     longsPerEU = bytesPerEU / sizeof(long);
325	long   *destLongBuf, *srcLongBuf1, *srcLongBuf2;
326	long temp1;
327#endif
328
329#if RF_EO_MATRIX_DIM > 17
330	RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
331	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
332#elif RF_EO_MATRIX_DIM == 17
333	RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
334	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
335#endif
336
337	S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
338#if RF_EO_MATRIX_DIM > 17
339	srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
340#elif RF_EO_MATRIX_DIM == 17
341	srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
342#endif
343
344	for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) {
345		indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
346
347#if RF_EO_MATRIX_DIM > 17
348		destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
349		srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
350		for (j = 0; j < shortsPerEU; j++) {
351			temp1 = destShortBuf[j] ^ srcShortBuf1[j];
352			/* note: S_index won't be at the end row for any src
353			 * col! */
354			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
355				destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
356			/* if indexInSrc is at the end row, ie.
357			 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
358			else
359				destShortBuf[j] = temp1;
360		}
361
362#elif RF_EO_MATRIX_DIM == 17
363		destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
364		srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
365		for (j = 0; j < longsPerEU; j++) {
366			temp1 = destLongBuf[j] ^ srcLongBuf1[j];
367			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
368				destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
369			else
370				destLongBuf[j] = temp1;
371		}
372#endif
373	}
374}
375
376void
377rf_e_encToBuf(
378    RF_Raid_t * raidPtr,
379    RF_RowCol_t srcLogicCol,
380    char *srcbuf,
381    RF_RowCol_t destLogicCol,
382    char *destbuf,
383    int numSector)
384{
385	int     i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
386
387	for (i = 0; i < numSector; i++) {
388		rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
389		srcbuf += bytesPerSector;
390		destbuf += bytesPerSector;
391	}
392}
/**************************************************************************************
 * When parity and one data disk have died, we use the second redundant information, 'E',
 * to recover the data in the dead disk. This function is used in the recovery node
 * for EO_110_CreateReadDAG
 **************************************************************************************/
398int
399rf_RecoveryEFunc(RF_DagNode_t *node)
400{
401	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
402	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
403	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
404	RF_RowCol_t scol,	/* source logical column */
405	        fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
406									 * failed SU */
407	int     i;
408	RF_PhysDiskAddr_t *pda;
409	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
410	char   *srcbuf, *destbuf;
411	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
412	RF_Etimer_t timer;
413
414	memset(node->results[0], 0,
415	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
416	if (node->dagHdr->status == rf_enable) {
417		RF_ETIMER_START(timer);
418		for (i = 0; i < node->numParams - 2; i += 2)
419			if (node->params[i + 1].p != node->results[0]) {
420				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
421				if (i == node->numParams - 4)
422					scol = RF_EO_MATRIX_DIM - 2;	/* the colume of
423									 * redundant E */
424				else
425					scol = rf_EUCol(layoutPtr, pda->raidAddress);
426				srcbuf = (char *) node->params[i + 1].p;
427				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
428				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
429				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
430			}
431		RF_ETIMER_STOP(timer);
432		RF_ETIMER_EVAL(timer);
433		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
434	}
435	return (rf_GenericWakeupFunc(node, 0));	/* node execute successfully */
436}
/**************************************************************************************
 * This function is used in the case where one data disk and the parity have failed.
 * (in EO_110_CreateWriteDAG )
 **************************************************************************************/
441int
442rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)
443{
444	rf_DegrESubroutine(node, node->results[0]);
445	rf_GenericWakeupFunc(node, 0);
446#if 1
447	return (0);		/* XXX Yet another one!! GO */
448#endif
449}
450
451
452
453/**************************************************************************************
454 *  		THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
455 **************************************************************************************/
456
457void
458rf_doubleEOdecode(
459    RF_Raid_t * raidPtr,
460    char **rrdbuf,
461    char **dest,
462    RF_RowCol_t * fcol,
463    char *pbuf,
464    char *ebuf)
465{
466	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
467	int     i, j, k, f1, f2, row;
468	int     rrdrow, erow, count = 0;
469	int     bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
470	int     numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
471#if 0
472	int     pcol = (RF_EO_MATRIX_DIM) - 1;
473#endif
474	int     ecol = (RF_EO_MATRIX_DIM) - 2;
475	int     bytesPerEU = bytesPerSector / numRowInEncMatix;
476	int     numDataCol = layoutPtr->numDataCol;
477#if RF_EO_MATRIX_DIM > 17
478	int     shortsPerEU = bytesPerEU / sizeof(short);
479	short  *rrdbuf_current, *pbuf_current, *ebuf_current;
480	short  *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
481	short *temp;
482	short  *P;
483
484	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
485#elif RF_EO_MATRIX_DIM == 17
486	int     longsPerEU = bytesPerEU / sizeof(long);
487	long   *rrdbuf_current, *pbuf_current, *ebuf_current;
488	long   *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
489	long *temp;
490	long   *P;
491
492	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
493#endif
494	P = RF_Malloc(bytesPerEU);
495	temp = RF_Malloc(bytesPerEU);
496	RF_ASSERT(*((long *) dest[0]) == 0);
497	RF_ASSERT(*((long *) dest[1]) == 0);
498	RF_ASSERT(*P == 0);
499	/* calculate the 'P' parameter, which, not parity, is the Xor of all
500	 * elements in the last two column, ie. 'E' and 'parity' colume, see
501	 * the Ref. paper by Blaum, et al 1993  */
502	for (i = 0; i < numRowInEncMatix; i++)
503		for (k = 0; k < longsPerEU; k++) {
504#if RF_EO_MATRIX_DIM > 17
505			ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
506			pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
507#elif RF_EO_MATRIX_DIM == 17
508			ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
509			pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
510#endif
511			P[k] ^= *ebuf_current;
512			P[k] ^= *pbuf_current;
513		}
514	RF_ASSERT(fcol[0] != fcol[1]);
515	if (fcol[0] < fcol[1]) {
516#if RF_EO_MATRIX_DIM > 17
517		dest_smaller = (short *) (dest[0]);
518		dest_larger = (short *) (dest[1]);
519#elif RF_EO_MATRIX_DIM == 17
520		dest_smaller = (long *) (dest[0]);
521		dest_larger = (long *) (dest[1]);
522#endif
523		f1 = fcol[0];
524		f2 = fcol[1];
525	} else {
526#if RF_EO_MATRIX_DIM > 17
527		dest_smaller = (short *) (dest[1]);
528		dest_larger = (short *) (dest[0]);
529#elif RF_EO_MATRIX_DIM == 17
530		dest_smaller = (long *) (dest[1]);
531		dest_larger = (long *) (dest[0]);
532#endif
533		f1 = fcol[1];
534		f2 = fcol[0];
535	}
536	row = (RF_EO_MATRIX_DIM) - 1;
537	while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) {
538#if RF_EO_MATRIX_DIM > 17
539		dest_larger_current = dest_larger + row * shortsPerEU;
540		dest_smaller_current = dest_smaller + row * shortsPerEU;
541#elif RF_EO_MATRIX_DIM == 17
542		dest_larger_current = dest_larger + row * longsPerEU;
543		dest_smaller_current = dest_smaller + row * longsPerEU;
544#endif
545		/**    Do the diagonal recovery. Initially, temp[k] = (failed 1),
546		       which is the failed data in the colume which has smaller col index. **/
547		/* step 1:  ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3))         */
548		for (j = 0; j < numDataCol; j++) {
549			if (j == f1 || j == f2)
550				continue;
551			rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
552			if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
553#if RF_EO_MATRIX_DIM > 17
554				rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU;
555				for (k = 0; k < shortsPerEU; k++)
556					temp[k] ^= *(rrdbuf_current + k);
557#elif RF_EO_MATRIX_DIM == 17
558				rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU;
559				for (k = 0; k < longsPerEU; k++)
560					temp[k] ^= *(rrdbuf_current + k);
561#endif
562			}
563		}
564		/* step 2:  ^E(erow,m-2), If erow is at the buttom row, don't
565		 * Xor into it  E(erow,m-2) = (principle diagonal) ^ (failed
566		 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
567		 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle
568		 * diagonal) ^ (failed 2)       */
569
570		erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
571		if (erow != (RF_EO_MATRIX_DIM) - 1) {
572#if RF_EO_MATRIX_DIM > 17
573			ebuf_current = (short *) ebuf + shortsPerEU * erow;
574			for (k = 0; k < shortsPerEU; k++)
575				temp[k] ^= *(ebuf_current + k);
576#elif RF_EO_MATRIX_DIM == 17
577			ebuf_current = (long *) ebuf + longsPerEU * erow;
578			for (k = 0; k < longsPerEU; k++)
579				temp[k] ^= *(ebuf_current + k);
580#endif
581		}
582		/* step 3: ^P to obtain the failed data (failed 2).  P can be
583		 * proved to be actually  (principle diagonal)  After this
584		 * step, temp[k] = (failed 2), the failed data to be recovered */
585#if RF_EO_MATRIX_DIM > 17
586		for (k = 0; k < shortsPerEU; k++)
587			temp[k] ^= P[k];
588		/* Put the data to the destination buffer                              */
589		for (k = 0; k < shortsPerEU; k++)
590			dest_larger_current[k] = temp[k];
591#elif RF_EO_MATRIX_DIM == 17
592		for (k = 0; k < longsPerEU; k++)
593			temp[k] ^= P[k];
594		/* Put the data to the destination buffer                              */
595		for (k = 0; k < longsPerEU; k++)
596			dest_larger_current[k] = temp[k];
597#endif
598
599		/**          THE FOLLOWING DO THE HORIZONTAL XOR                **/
600		/* step 1:  ^(SUM of A(row,0..m-3)), ie. all nonfailed data
601		 * columes    */
602		for (j = 0; j < numDataCol; j++) {
603			if (j == f1 || j == f2)
604				continue;
605#if RF_EO_MATRIX_DIM > 17
606			rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU;
607			for (k = 0; k < shortsPerEU; k++)
608				temp[k] ^= *(rrdbuf_current + k);
609#elif RF_EO_MATRIX_DIM == 17
610			rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU;
611			for (k = 0; k < longsPerEU; k++)
612				temp[k] ^= *(rrdbuf_current + k);
613#endif
614		}
615		/* step 2: ^A(row,m-1) */
616		/* step 3: Put the data to the destination buffer                             	 */
617#if RF_EO_MATRIX_DIM > 17
618		pbuf_current = (short *) pbuf + shortsPerEU * row;
619		for (k = 0; k < shortsPerEU; k++)
620			temp[k] ^= *(pbuf_current + k);
621		for (k = 0; k < shortsPerEU; k++)
622			dest_smaller_current[k] = temp[k];
623#elif RF_EO_MATRIX_DIM == 17
624		pbuf_current = (long *) pbuf + longsPerEU * row;
625		for (k = 0; k < longsPerEU; k++)
626			temp[k] ^= *(pbuf_current + k);
627		for (k = 0; k < longsPerEU; k++)
628			dest_smaller_current[k] = temp[k];
629#endif
630		count++;
631	}
632	/* Check if all Encoding Unit in the data buffer have been decoded,
633	 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
634	 * this algorithm will covered all buffer 				 */
635	RF_ASSERT(count == numRowInEncMatix);
636	RF_Free((char *) P, bytesPerEU);
637	RF_Free((char *) temp, bytesPerEU);
638}
639
640
/***************************************************************************************
* 	This function is called by the double degraded read
* 	EO_200_CreateReadDAG
*
***************************************************************************************/
646int
647rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
648{
649	int     ndataParam = 0;
650	int     np = node->numParams;
651	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
652	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
653	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
654	int     i, prm, sector, nresults = node->numResults;
655	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
656	unsigned sosAddr;
657	int     mallc_one = 0, mallc_two = 0;	/* flags to indicate if
658						 * memory is allocated */
659	int     bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
660	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
661	        npda;
662	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
663	char  **buf, *ebuf, *pbuf, *dest[2];
664	long   *suoff = NULL, *suend = NULL, *prmToCol = NULL,
665	    psuoff = 0, esuoff = 0;
666	RF_SectorNum_t startSector, endSector;
667	RF_Etimer_t timer;
668	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
669
670	RF_ETIMER_START(timer);
671
672	/* Find out the number of parameters which are pdas for data
673	 * information */
674	for (i = 0; i <= np; i++)
675		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
676			ndataParam = i;
677			break;
678		}
679	buf = RF_Malloc(numDataCol * sizeof(*buf));
680	if (ndataParam != 0) {
681		suoff = RF_Malloc(ndataParam * sizeof(*suoff));
682		suend = RF_Malloc(ndataParam * sizeof(*suend));
683		prmToCol = RF_Malloc(ndataParam * sizeof(*prmToCol));
684	}
685	if (asmap->failedPDAs[1] &&
686	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
687		RF_ASSERT(0);	/* currently, no support for this situation */
688		ppda = node->params[np - 6].p;
689		ppda2 = node->params[np - 5].p;
690		RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
691		epda = node->params[np - 4].p;
692		epda2 = node->params[np - 3].p;
693		RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
694	} else {
695		ppda = node->params[np - 4].p;
696		epda = node->params[np - 3].p;
697		psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
698		esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
699		RF_ASSERT(psuoff == esuoff);
700	}
701	/*
702            the followings have three goals:
703            1. determine the startSector to begin decoding and endSector to end decoding.
704            2. determine the colume numbers of the two failed disks.
705            3. determine the offset and end offset of the access within each failed stripe unit.
706         */
707	if (nresults == 1) {
708		/* find the startSector to begin decoding */
709		pda = node->results[0];
710		memset(pda->bufPtr, 0, bytesPerSector * pda->numSector);
711		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
712		fsuend[0] = fsuoff[0] + pda->numSector;
713		fsuoff[1] = 0;
714		fsuend[1] = 0;
715		startSector = fsuoff[0];
716		endSector = fsuend[0];
717
718		/* find out the column of failed disk being accessed */
719		fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
720
721		/* find out the other failed colume not accessed */
722		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
723		for (i = 0; i < numDataCol; i++) {
724			npda.raidAddress = sosAddr + (i * secPerSU);
725			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
726			/* skip over dead disks */
727			if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
728				if (i != fcol[0])
729					break;
730		}
731		RF_ASSERT(i < numDataCol);
732		fcol[1] = i;
733	} else {
734		RF_ASSERT(nresults == 2);
735		pda0 = node->results[0];
736		memset(pda0->bufPtr, 0, bytesPerSector * pda0->numSector);
737		pda1 = node->results[1];
738		memset(pda1->bufPtr, 0, bytesPerSector * pda1->numSector);
739		/* determine the failed colume numbers of the two failed
740		 * disks. */
741		fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
742		fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
743		/* determine the offset and end offset of the access within
744		 * each failed stripe unit. */
745		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
746		fsuend[0] = fsuoff[0] + pda0->numSector;
747		fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
748		fsuend[1] = fsuoff[1] + pda1->numSector;
749		/* determine the startSector to begin decoding */
750		startSector = RF_MIN(pda0->startSector, pda1->startSector);
751		/* determine the endSector to end decoding */
752		endSector = RF_MAX(fsuend[0], fsuend[1]);
753	}
754	/*
755	      assign the beginning sector and the end sector for each parameter
756	      find out the corresponding colume # for each parameter
757        */
758	for (prm = 0; prm < ndataParam; prm++) {
759		pda = node->params[prm].p;
760		suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
761		suend[prm] = suoff[prm] + pda->numSector;
762		prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
763	}
764	/* 'sector' is the sector for the current decoding algorithm. For each
765	 * sector in the failed SU, find out the corresponding parameters that
766	 * cover the current sector and that are needed for decoding of this
767	 * sector in failed SU. 2.  Find out if sector is in the shadow of any
768	 * accessed failed SU. If not, malloc a temporary space of a sector in
769	 * size. */
770	for (sector = startSector; sector < endSector; sector++) {
771		if (nresults == 2)
772			if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1]))
773				continue;
774		for (prm = 0; prm < ndataParam; prm++)
775			if (suoff[prm] <= sector && sector < suend[prm])
776				buf[(prmToCol[prm])] = (char *)((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr +
777				    rf_RaidAddressToByte(raidPtr, sector - suoff[prm]);
778		/* find out if sector is in the shadow of any accessed failed
779		 * SU. If yes, assign dest[0], dest[1] to point at suitable
780		 * position of the buffer corresponding to failed SUs. if no,
781		 * malloc a temporary space of a sector in size for
782		 * destination of decoding. */
783		RF_ASSERT(nresults == 1 || nresults == 2);
784		if (nresults == 1) {
785			dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
786			/* Always malloc temp buffer to dest[1]  */
787			dest[1] = RF_Malloc(bytesPerSector);
788			mallc_two = 1;
789		} else {
790			if (fsuoff[0] <= sector && sector < fsuend[0])
791				dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
792			else {
793				dest[0] = RF_Malloc(bytesPerSector);
794				mallc_one = 1;
795			}
796			if (fsuoff[1] <= sector && sector < fsuend[1])
797				dest[1] = (char *)((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]);
798			else {
799				dest[1] = RF_Malloc(bytesPerSector);
800				mallc_two = 1;
801			}
802			RF_ASSERT(mallc_one == 0 || mallc_two == 0);
803		}
804		pbuf = (char *)ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff);
805		ebuf = (char *)epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff);
806		/*
807	         * After finish finding all needed sectors, call doubleEOdecode function for decoding
808	         * one sector to destination.
809	         */
810		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
811		/* free all allocated memory, and mark flag to indicate no
812		 * memory is being allocated */
813		if (mallc_one == 1)
814			RF_Free(dest[0], bytesPerSector);
815		if (mallc_two == 1)
816			RF_Free(dest[1], bytesPerSector);
817		mallc_one = mallc_two = 0;
818	}
819	RF_Free(buf, numDataCol * sizeof(char *));
820	if (ndataParam != 0) {
821		RF_Free(suoff, ndataParam * sizeof(long));
822		RF_Free(suend, ndataParam * sizeof(long));
823		RF_Free(prmToCol, ndataParam * sizeof(long));
824	}
825	RF_ETIMER_STOP(timer);
826	RF_ETIMER_EVAL(timer);
827	if (tracerec) {
828		tracerec->q_us += RF_ETIMER_VAL_US(timer);
829	}
830	rf_GenericWakeupFunc(node, 0);
831#if 1
832	return (0);		/* XXX is this even close!!?!?!!? GO */
833#endif
834}
835
836
837/* currently, only access of one of the two failed SU is allowed in this function.
838 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into
839 * many accesses of single stripe unit.
840 */
841
842int
843rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node)
844{
845	int     np = node->numParams;
846	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
847	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
848	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
849	RF_SectorNum_t sector;
850	RF_RowCol_t col, scol;
851	int     prm, i, j;
852	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
853	unsigned sosAddr;
854	unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
855	RF_int64 numbytes;
856	RF_SectorNum_t startSector, endSector;
857	RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
858	RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
859	char  **buf;		/* buf[0], buf[1], buf[2], ...etc. point to
860				 * buffer storing data read from col0, col1,
861				 * col2 */
862	char   *ebuf, *pbuf, *dest[2], *olddata[2];
863	RF_Etimer_t timer;
864	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
865
866	RF_ASSERT(asmap->numDataFailed == 1);	/* currently only support this
867						 * case, the other failed SU
868						 * is not being accessed */
869	RF_ETIMER_START(timer);
870	buf = RF_Malloc(numDataCol * sizeof(*buf));
871
872	ppda = node->results[0];/* Instead of being buffers, node->results[0]
873				 * and [1] are Ppda and Epda  */
874	epda = node->results[1];
875	fpda = asmap->failedPDAs[0];
876
877	/* First, recovery the failed old SU using EvenOdd double decoding      */
878	/* determine the startSector and endSector for decoding */
879	startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
880	endSector = startSector + fpda->numSector;
881	/* Assign buf[col] pointers to point to each non-failed colume  and
882	 * initialize the pbuf and ebuf to point at the beginning of each
883	 * source buffers and destination buffers */
884	for (prm = 0; prm < numDataCol - 2; prm++) {
885		pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
886		col = rf_EUCol(layoutPtr, pda->raidAddress);
887		buf[col] = pda->bufPtr;
888	}
889	/* pbuf and ebuf:  they will change values as double recovery decoding
890	 * goes on */
891	pbuf = ppda->bufPtr;
892	ebuf = epda->bufPtr;
893	/* find out the logical colume numbers in the encoding matrix of the
894	 * two failed columes */
895	fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
896
897	/* find out the other failed colume not accessed this time */
898	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
899	for (i = 0; i < numDataCol; i++) {
900		npda.raidAddress = sosAddr + (i * secPerSU);
901		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
902		/* skip over dead disks */
903		if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
904			if (i != fcol[0])
905				break;
906	}
907	RF_ASSERT(i < numDataCol);
908	fcol[1] = i;
909	/* assign temporary space to put recovered failed SU */
910	numbytes = fpda->numSector * bytesPerSector;
911	olddata[0] = RF_Malloc(numbytes);
912	olddata[1] = RF_Malloc(numbytes);
913	dest[0] = olddata[0];
914	dest[1] = olddata[1];
915	/* Begin the recovery decoding, initially buf[j],  ebuf, pbuf, dest[j]
916	 * have already pointed at the beginning of each source buffers and
917	 * destination buffers */
918	for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
919		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
920		for (j = 0; j < numDataCol; j++)
921			if ((j != fcol[0]) && (j != fcol[1]))
922				buf[j] += bytesPerSector;
923		dest[0] += bytesPerSector;
924		dest[1] += bytesPerSector;
925		ebuf += bytesPerSector;
926		pbuf += bytesPerSector;
927	}
928	/* after recovery, the buffer pointed by olddata[0] is the old failed
929	 * data. With new writing data and this old data, use small write to
930	 * calculate the new redundant informations */
931	/* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
932	 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
933	 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
934	 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
935	 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
936	 * wudNodes; For current implementation, we assume the simplest case:
937	 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
938	 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
939	 * data to be writen to the failed disk. We first bxor the new data
940	 * into the old recovered data, then do the same things as small
941	 * write. */
942
943	rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes);
944	/* do new 'E' calculation  */
945	/* find out the corresponding colume in encoding matrix for write
946	 * colume to be encoded into redundant disk 'E' */
947	scol = rf_EUCol(layoutPtr, fpda->raidAddress);
948	/* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
949	 * buffer pointer               */
950	rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
951
952	/* do new 'P' calculation  */
953	rf_bxor(olddata[0], ppda->bufPtr, numbytes);
954	/* Free the allocated buffer  */
955	RF_Free(olddata[0], numbytes);
956	RF_Free(olddata[1], numbytes);
957	RF_Free(buf, numDataCol * sizeof(char *));
958
959	RF_ETIMER_STOP(timer);
960	RF_ETIMER_EVAL(timer);
961	if (tracerec) {
962		tracerec->q_us += RF_ETIMER_VAL_US(timer);
963	}
964	rf_GenericWakeupFunc(node, 0);
965	return (0);
966}
967#endif				/* RF_INCLUDE_EVENODD > 0 */
968