1/*	$NetBSD: rf_evenodd_dagfuncs.c,v 1.25 2022/02/16 22:00:56 andvar Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: ChangMing Wu
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * Code for RAID-EVENODD  architecture.
31 */
32
33#include <sys/cdefs.h>
34__KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.25 2022/02/16 22:00:56 andvar Exp $");
35
36#include "rf_archs.h"
37
38#ifdef _KERNEL_OPT
39#include "opt_raid_diagnostic.h"
40#endif
41
42#if RF_INCLUDE_EVENODD > 0
43
44#include <dev/raidframe/raidframevar.h>
45
46#include "rf_raid.h"
47#include "rf_dag.h"
48#include "rf_dagffrd.h"
49#include "rf_dagffwr.h"
50#include "rf_dagdegrd.h"
51#include "rf_dagdegwr.h"
52#include "rf_dagutils.h"
53#include "rf_dagfuncs.h"
54#include "rf_etimer.h"
55#include "rf_general.h"
56#include "rf_parityscan.h"
57#include "rf_evenodd.h"
58#include "rf_evenodd_dagfuncs.h"
59
60/* These redundant functions are for small write */
61RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"};
62RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"};
63/* These redundant functions are for degraded read */
64RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
65RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"};
66/**********************************************************************************************
67 *   the following encoding node functions is used in  EO_000_CreateLargeWriteDAG
68 **********************************************************************************************/
69void
70rf_RegularPEFunc(RF_DagNode_t *node)
71{
72	rf_RegularESubroutine(node, node->results[1]);
73	rf_RegularXorFunc(node);/* does the wakeup here! */
74}
75
76
/************************************************************************************************
 *  For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and (ii) SimpleONEFunc() to
 *  be used. The former is used when the write accesses at least a full stripe unit's worth of
 *  sectors. The latter is used when the write accesses two stripe units but the total number of
 *  sectors is less than the sectors per SU. In this case, the accesses to parity and 'E' appear
 *  as disconnected areas in their stripe units, and the parity write and the 'E' write are each
 *  divided into two distinct writes (four in total). This simple old-new write and regular
 *  old-new write happen as in RAID-5.
 ************************************************************************************************/
85
86/* Algorithm:
87     1. Store the difference of old data and new data in the Rod buffer.
88     2. then encode this buffer into the buffer which already have old 'E' information inside it,
89	the result can be shown to be the new 'E' information.
90     3. xor the Wnd buffer into the difference buffer to recover the  original old data.
91   Here we have another alternative: to allocate a temporary buffer for storing the difference of
92   old data and new data, then encode temp buf into old 'E' buf to form new 'E', but this approach
93   take the same speed as the previous, and need more memory.
94*/
95void
96rf_RegularONEFunc(RF_DagNode_t *node)
97{
98	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
99	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
100	int     EpdaIndex = (node->numParams - 1) / 2 - 1;	/* the parameter of node
101								 * where you can find
102								 * e-pda */
103	int     i, k;
104	int     suoffset, length;
105	RF_RowCol_t scol;
106	char   *srcbuf, *destbuf;
107	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
108	RF_Etimer_t timer;
109	RF_PhysDiskAddr_t *pda;
110#ifdef RAID_DIAGNOSTIC
111	RF_PhysDiskAddr_t *EPDA =
112	    (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
113	int     ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);
114
115	RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
116	RF_ASSERT(ESUOffset == 0);
117#endif /* RAID_DIAGNOSTIC */
118
119	RF_ETIMER_START(timer);
120
121	/* Xor the Wnd buffer into Rod buffer, the difference of old data and
122	 * new data is stored in Rod buffer */
123	for (k = 0; k < EpdaIndex; k += 2) {
124		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
125		rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
126	}
127	/* Start to encoding the buffer storing the difference of old data and
128	 * new data into 'E' buffer  */
129	for (i = 0; i < EpdaIndex; i += 2)
130		if (node->params[i + 1].p != node->results[0]) {	/* results[0] is buf ptr
131									 * of E */
132			pda = (RF_PhysDiskAddr_t *) node->params[i].p;
133			srcbuf = (char *) node->params[i + 1].p;
134			scol = rf_EUCol(layoutPtr, pda->raidAddress);
135			suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
136			destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
137			rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
138		}
139	/* Recover the original old data to be used by parity encoding
140	 * function in XorNode */
141	for (k = 0; k < EpdaIndex; k += 2) {
142		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
143		rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
144	}
145	RF_ETIMER_STOP(timer);
146	RF_ETIMER_EVAL(timer);
147	tracerec->q_us += RF_ETIMER_VAL_US(timer);
148	rf_GenericWakeupFunc(node, 0);
149}
150
151void
152rf_SimpleONEFunc(RF_DagNode_t *node)
153{
154	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
155	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
156	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
157	int     retcode = 0;
158	char   *srcbuf, *destbuf;
159	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
160	int     length;
161	RF_RowCol_t scol;
162	RF_Etimer_t timer;
163
164	RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
165	if (node->dagHdr->status == rf_enable) {
166		RF_ETIMER_START(timer);
167		length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector);	/* this is a pda of
168														 * writeDataNodes */
169		/* bxor to buffer of readDataNodes */
170		retcode = rf_bxor(node->params[5].p, node->params[1].p, length);
171		/* find out the corresponding column in encoding matrix for
172		 * write column to be encoded into redundant disk 'E' */
173		scol = rf_EUCol(layoutPtr, pda->raidAddress);
174		srcbuf = node->params[1].p;
175		destbuf = node->params[3].p;
176		/* Start encoding process */
177		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
178		rf_bxor(node->params[5].p, node->params[1].p, length);
179		RF_ETIMER_STOP(timer);
180		RF_ETIMER_EVAL(timer);
181		tracerec->q_us += RF_ETIMER_VAL_US(timer);
182
183	}
184	rf_GenericWakeupFunc(node, retcode);	/* call wake func
185						 * explicitly since no
186						 * I/O in this node */
187}
188
189
190/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write  ********/
191void
192rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf)
193{
194	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
195	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
196	RF_PhysDiskAddr_t *pda;
197	int     i, suoffset;
198	RF_RowCol_t scol;
199	char   *srcbuf, *destbuf;
200	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
201	RF_Etimer_t timer;
202
203	RF_ETIMER_START(timer);
204	for (i = 0; i < node->numParams - 2; i += 2) {
205		RF_ASSERT(node->params[i + 1].p != ebuf);
206		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
207		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
208		scol = rf_EUCol(layoutPtr, pda->raidAddress);
209		srcbuf = (char *) node->params[i + 1].p;
210		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset);
211		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
212	}
213	RF_ETIMER_STOP(timer);
214	RF_ETIMER_EVAL(timer);
215	tracerec->xor_us += RF_ETIMER_VAL_US(timer);
216}
217
218
219/*******************************************************************************************
220 *			 Used in  EO_001_CreateLargeWriteDAG
221 ******************************************************************************************/
222void
223rf_RegularEFunc(RF_DagNode_t *node)
224{
225	rf_RegularESubroutine(node, node->results[0]);
226	rf_GenericWakeupFunc(node, 0);
227}
228/*******************************************************************************************
229 * This degraded function allow only two case:
230 *  1. when write access the full failed stripe unit, then the access can be more than
231 *     one tripe units.
232 *  2. when write access only part of the failed SU, we assume accesses of more than
233 *     one stripe unit is not allowed so that the write can be dealt with like a
234 *     large write.
235 *  The following function is based on these assumptions. So except in the second case,
236 *  it looks the same as a large write encoding function. But this is not exactly the
237 *  normal way for doing a degraded write, since raidframe have to break cases of access
238 *  other than the above two into smaller accesses. We may have to change
239 *  DegrESubroutine in the future.
240 *******************************************************************************************/
241void
242rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
243{
244	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
245	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
246	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
247	RF_PhysDiskAddr_t *pda;
248	int     i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
249	RF_RowCol_t scol;
250	char   *srcbuf, *destbuf;
251	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
252	RF_Etimer_t timer;
253
254	RF_ETIMER_START(timer);
255	for (i = 0; i < node->numParams - 2; i += 2) {
256		RF_ASSERT(node->params[i + 1].p != ebuf);
257		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
258		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
259		scol = rf_EUCol(layoutPtr, pda->raidAddress);
260		srcbuf = (char *) node->params[i + 1].p;
261		destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
262		rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
263	}
264
265	RF_ETIMER_STOP(timer);
266	RF_ETIMER_EVAL(timer);
267	tracerec->q_us += RF_ETIMER_VAL_US(timer);
268}
269
270
271/**************************************************************************************
272 * This function is used in case where one data disk failed and both redundant disks
273 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk
274 * failed in the stripe but not accessed at this time, then we should, instead, use
275 * the rf_EOWriteDoubleRecoveryFunc().
276 **************************************************************************************/
277void
278rf_Degraded_100_EOFunc(RF_DagNode_t *node)
279{
280	rf_DegrESubroutine(node, node->results[1]);
281	rf_RecoveryXorFunc(node);	/* does the wakeup here! */
282}
283/**************************************************************************************
284 * This function is to encode one sector in one of the data disks to the E disk.
285 * However, in evenodd this function can also be used as decoding function to recover
286 * data from dead disk in the case of parity failure and a single data failure.
287 **************************************************************************************/
288void
289rf_e_EncOneSect(
290    RF_RowCol_t srcLogicCol,
291    char *srcSecbuf,
292    RF_RowCol_t destLogicCol,
293    char *destSecbuf,
294    int bytesPerSector)
295{
296	int     S_index;	/* index of the EU in the src col which need
297				 * be Xored into all EUs in a dest sector */
298	int     numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
299	RF_RowCol_t j, indexInDest,	/* row index of an encoding unit in
300					 * the destination column of encoding
301					 * matrix */
302	        indexInSrc;	/* row index of an encoding unit in the source
303				 * column used for recovery */
304	int     bytesPerEU = bytesPerSector / numRowInEncMatix;
305
306#if RF_EO_MATRIX_DIM > 17
307	int     shortsPerEU = bytesPerEU / sizeof(short);
308	short  *destShortBuf, *srcShortBuf1, *srcShortBuf2;
309	short temp1;
310#elif RF_EO_MATRIX_DIM == 17
311	int     longsPerEU = bytesPerEU / sizeof(long);
312	long   *destLongBuf, *srcLongBuf1, *srcLongBuf2;
313	long temp1;
314#endif
315
316#if RF_EO_MATRIX_DIM > 17
317	RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
318	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
319#elif RF_EO_MATRIX_DIM == 17
320	RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
321	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
322#endif
323
324	S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
325#if RF_EO_MATRIX_DIM > 17
326	srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
327#elif RF_EO_MATRIX_DIM == 17
328	srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
329#endif
330
331	for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) {
332		indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
333
334#if RF_EO_MATRIX_DIM > 17
335		destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
336		srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
337		for (j = 0; j < shortsPerEU; j++) {
338			temp1 = destShortBuf[j] ^ srcShortBuf1[j];
339			/* note: S_index won't be at the end row for any src
340			 * col! */
341			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
342				destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
343			/* if indexInSrc is at the end row, ie.
344			 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
345			else
346				destShortBuf[j] = temp1;
347		}
348
349#elif RF_EO_MATRIX_DIM == 17
350		destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
351		srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
352		for (j = 0; j < longsPerEU; j++) {
353			temp1 = destLongBuf[j] ^ srcLongBuf1[j];
354			if (indexInSrc != RF_EO_MATRIX_DIM - 1)
355				destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
356			else
357				destLongBuf[j] = temp1;
358		}
359#endif
360	}
361}
362
363void
364rf_e_encToBuf(
365    RF_Raid_t * raidPtr,
366    RF_RowCol_t srcLogicCol,
367    char *srcbuf,
368    RF_RowCol_t destLogicCol,
369    char *destbuf,
370    int numSector)
371{
372	int     i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
373
374	for (i = 0; i < numSector; i++) {
375		rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
376		srcbuf += bytesPerSector;
377		destbuf += bytesPerSector;
378	}
379}
380/**************************************************************************************
381 * when parity die and one data die, We use second redundant information, 'E',
382 * to recover the data in dead disk. This function is used in the recovery node of
383 * for EO_110_CreateReadDAG
384 **************************************************************************************/
385void
386rf_RecoveryEFunc(RF_DagNode_t *node)
387{
388	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
389	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
390	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
391	RF_RowCol_t scol,	/* source logical column */
392	        fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress);	/* logical column of
393									 * failed SU */
394	int     i;
395	RF_PhysDiskAddr_t *pda;
396	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
397	char   *srcbuf, *destbuf;
398	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
399	RF_Etimer_t timer;
400
401	memset(node->results[0], 0,
402	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
403	if (node->dagHdr->status == rf_enable) {
404		RF_ETIMER_START(timer);
405		for (i = 0; i < node->numParams - 2; i += 2)
406			if (node->params[i + 1].p != node->results[0]) {
407				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
408				if (i == node->numParams - 4)
409					scol = RF_EO_MATRIX_DIM - 2;	/* the column of
410									 * redundant E */
411				else
412					scol = rf_EUCol(layoutPtr, pda->raidAddress);
413				srcbuf = (char *) node->params[i + 1].p;
414				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
415				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
416				rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
417			}
418		RF_ETIMER_STOP(timer);
419		RF_ETIMER_EVAL(timer);
420		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
421	}
422	rf_GenericWakeupFunc(node, 0);	/* node execute successfully */
423}
424/**************************************************************************************
425 * This function is used in the case where one data and the parity have filed.
426 * (in EO_110_CreateWriteDAG )
427 **************************************************************************************/
428void
429rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)
430{
431	rf_DegrESubroutine(node, node->results[0]);
432	rf_GenericWakeupFunc(node, 0);
433}
434
435
436
437/**************************************************************************************
438 *  		THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
439 **************************************************************************************/
440
441void
442rf_doubleEOdecode(
443    RF_Raid_t * raidPtr,
444    char **rrdbuf,
445    char **dest,
446    RF_RowCol_t * fcol,
447    char *pbuf,
448    char *ebuf)
449{
450	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
451	int     i, j, k, f1, f2, row;
452	int     rrdrow, erow, count = 0;
453	int     bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
454	int     numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
455#if 0
456	int     pcol = (RF_EO_MATRIX_DIM) - 1;
457#endif
458	int     ecol = (RF_EO_MATRIX_DIM) - 2;
459	int     bytesPerEU = bytesPerSector / numRowInEncMatix;
460	int     numDataCol = layoutPtr->numDataCol;
461#if RF_EO_MATRIX_DIM > 17
462	int     shortsPerEU = bytesPerEU / sizeof(short);
463	short  *rrdbuf_current, *pbuf_current, *ebuf_current;
464	short  *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
465	short *temp;
466	short  *P;
467
468	RF_ASSERT(bytesPerEU % sizeof(short) == 0);
469#elif RF_EO_MATRIX_DIM == 17
470	int     longsPerEU = bytesPerEU / sizeof(long);
471	long   *rrdbuf_current, *pbuf_current, *ebuf_current;
472	long   *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
473	long *temp;
474	long   *P;
475
476	RF_ASSERT(bytesPerEU % sizeof(long) == 0);
477#endif
478	P = RF_Malloc(bytesPerEU);
479	temp = RF_Malloc(bytesPerEU);
480	RF_ASSERT(*((long *) dest[0]) == 0);
481	RF_ASSERT(*((long *) dest[1]) == 0);
482	RF_ASSERT(*P == 0);
483	/* calculate the 'P' parameter, which, not parity, is the Xor of all
484	 * elements in the last two columns, ie. 'E' and 'parity' columns, see
485	 * the Ref. paper by Blaum, et al 1993  */
486	for (i = 0; i < numRowInEncMatix; i++)
487		for (k = 0; k < longsPerEU; k++) {
488#if RF_EO_MATRIX_DIM > 17
489			ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
490			pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
491#elif RF_EO_MATRIX_DIM == 17
492			ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
493			pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
494#endif
495			P[k] ^= *ebuf_current;
496			P[k] ^= *pbuf_current;
497		}
498	RF_ASSERT(fcol[0] != fcol[1]);
499	if (fcol[0] < fcol[1]) {
500#if RF_EO_MATRIX_DIM > 17
501		dest_smaller = (short *) (dest[0]);
502		dest_larger = (short *) (dest[1]);
503#elif RF_EO_MATRIX_DIM == 17
504		dest_smaller = (long *) (dest[0]);
505		dest_larger = (long *) (dest[1]);
506#endif
507		f1 = fcol[0];
508		f2 = fcol[1];
509	} else {
510#if RF_EO_MATRIX_DIM > 17
511		dest_smaller = (short *) (dest[1]);
512		dest_larger = (short *) (dest[0]);
513#elif RF_EO_MATRIX_DIM == 17
514		dest_smaller = (long *) (dest[1]);
515		dest_larger = (long *) (dest[0]);
516#endif
517		f1 = fcol[1];
518		f2 = fcol[0];
519	}
520	row = (RF_EO_MATRIX_DIM) - 1;
521	while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) {
522#if RF_EO_MATRIX_DIM > 17
523		dest_larger_current = dest_larger + row * shortsPerEU;
524		dest_smaller_current = dest_smaller + row * shortsPerEU;
525#elif RF_EO_MATRIX_DIM == 17
526		dest_larger_current = dest_larger + row * longsPerEU;
527		dest_smaller_current = dest_smaller + row * longsPerEU;
528#endif
529		/**    Do the diagonal recovery. Initially, temp[k] = (failed 1),
530		       which is the failed data in the column which has smaller col index. **/
531		/* step 1:  ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3))         */
532		for (j = 0; j < numDataCol; j++) {
533			if (j == f1 || j == f2)
534				continue;
535			rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
536			if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
537#if RF_EO_MATRIX_DIM > 17
538				rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU;
539				for (k = 0; k < shortsPerEU; k++)
540					temp[k] ^= *(rrdbuf_current + k);
541#elif RF_EO_MATRIX_DIM == 17
542				rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU;
543				for (k = 0; k < longsPerEU; k++)
544					temp[k] ^= *(rrdbuf_current + k);
545#endif
546			}
547		}
548		/* step 2:  ^E(erow,m-2), If erow is at the bottom row, don't
549		 * Xor into it  E(erow,m-2) = (principle diagonal) ^ (failed
550		 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
551		 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle
552		 * diagonal) ^ (failed 2)       */
553
554		erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
555		if (erow != (RF_EO_MATRIX_DIM) - 1) {
556#if RF_EO_MATRIX_DIM > 17
557			ebuf_current = (short *) ebuf + shortsPerEU * erow;
558			for (k = 0; k < shortsPerEU; k++)
559				temp[k] ^= *(ebuf_current + k);
560#elif RF_EO_MATRIX_DIM == 17
561			ebuf_current = (long *) ebuf + longsPerEU * erow;
562			for (k = 0; k < longsPerEU; k++)
563				temp[k] ^= *(ebuf_current + k);
564#endif
565		}
566		/* step 3: ^P to obtain the failed data (failed 2).  P can be
567		 * proved to be actually  (principle diagonal)  After this
568		 * step, temp[k] = (failed 2), the failed data to be recovered */
569#if RF_EO_MATRIX_DIM > 17
570		for (k = 0; k < shortsPerEU; k++)
571			temp[k] ^= P[k];
572		/* Put the data to the destination buffer                              */
573		for (k = 0; k < shortsPerEU; k++)
574			dest_larger_current[k] = temp[k];
575#elif RF_EO_MATRIX_DIM == 17
576		for (k = 0; k < longsPerEU; k++)
577			temp[k] ^= P[k];
578		/* Put the data to the destination buffer                              */
579		for (k = 0; k < longsPerEU; k++)
580			dest_larger_current[k] = temp[k];
581#endif
582
583		/**          THE FOLLOWING DO THE HORIZONTAL XOR                **/
584		/* step 1:  ^(SUM of A(row,0..m-3)), ie. all nonfailed data
585		 * columns    */
586		for (j = 0; j < numDataCol; j++) {
587			if (j == f1 || j == f2)
588				continue;
589#if RF_EO_MATRIX_DIM > 17
590			rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU;
591			for (k = 0; k < shortsPerEU; k++)
592				temp[k] ^= *(rrdbuf_current + k);
593#elif RF_EO_MATRIX_DIM == 17
594			rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU;
595			for (k = 0; k < longsPerEU; k++)
596				temp[k] ^= *(rrdbuf_current + k);
597#endif
598		}
599		/* step 2: ^A(row,m-1) */
600		/* step 3: Put the data to the destination buffer                             	 */
601#if RF_EO_MATRIX_DIM > 17
602		pbuf_current = (short *) pbuf + shortsPerEU * row;
603		for (k = 0; k < shortsPerEU; k++)
604			temp[k] ^= *(pbuf_current + k);
605		for (k = 0; k < shortsPerEU; k++)
606			dest_smaller_current[k] = temp[k];
607#elif RF_EO_MATRIX_DIM == 17
608		pbuf_current = (long *) pbuf + longsPerEU * row;
609		for (k = 0; k < longsPerEU; k++)
610			temp[k] ^= *(pbuf_current + k);
611		for (k = 0; k < longsPerEU; k++)
612			dest_smaller_current[k] = temp[k];
613#endif
614		count++;
615	}
616	/* Check if all Encoding Unit in the data buffer have been decoded,
617	 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
618	 * this algorithm will covered all buffer 				 */
619	RF_ASSERT(count == numRowInEncMatix);
620	RF_Free((char *) P, bytesPerEU);
621	RF_Free((char *) temp, bytesPerEU);
622}
623
624
625/***************************************************************************************
626* 	This function is called by double degraded read
627* 	EO_200_CreateReadDAG
628*
629***************************************************************************************/
630void
631rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
632{
633	int     ndataParam = 0;
634	int     np = node->numParams;
635	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
636	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
637	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
638	int     i, prm, sector, nresults = node->numResults;
639	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
640	unsigned sosAddr;
641	int     mallc_one = 0, mallc_two = 0;	/* flags to indicate if
642						 * memory is allocated */
643	int     bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
644	RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
645	        npda;
646	RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
647	char  **buf, *ebuf, *pbuf, *dest[2];
648	long   *suoff = NULL, *suend = NULL, *prmToCol = NULL,
649	    psuoff = 0, esuoff = 0;
650	RF_SectorNum_t startSector, endSector;
651	RF_Etimer_t timer;
652	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
653
654	RF_ETIMER_START(timer);
655
656	/* Find out the number of parameters which are pdas for data
657	 * information */
658	for (i = 0; i <= np; i++)
659		if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
660			ndataParam = i;
661			break;
662		}
663	buf = RF_Malloc(numDataCol * sizeof(*buf));
664	if (ndataParam != 0) {
665		suoff = RF_Malloc(ndataParam * sizeof(*suoff));
666		suend = RF_Malloc(ndataParam * sizeof(*suend));
667		prmToCol = RF_Malloc(ndataParam * sizeof(*prmToCol));
668	}
669	if (asmap->failedPDAs[1] &&
670	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
671		RF_ASSERT(0);	/* currently, no support for this situation */
672		ppda = node->params[np - 6].p;
673		ppda2 = node->params[np - 5].p;
674		RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
675		epda = node->params[np - 4].p;
676		epda2 = node->params[np - 3].p;
677		RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
678	} else {
679		ppda = node->params[np - 4].p;
680		epda = node->params[np - 3].p;
681		psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
682		esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
683		RF_ASSERT(psuoff == esuoff);
684	}
685	/*
686            the followings have three goals:
687            1. determine the startSector to begin decoding and endSector to end decoding.
688            2. determine the column numbers of the two failed disks.
689            3. determine the offset and end offset of the access within each failed stripe unit.
690         */
691	if (nresults == 1) {
692		/* find the startSector to begin decoding */
693		pda = node->results[0];
694		memset(pda->bufPtr, 0, bytesPerSector * pda->numSector);
695		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
696		fsuend[0] = fsuoff[0] + pda->numSector;
697		fsuoff[1] = 0;
698		fsuend[1] = 0;
699		startSector = fsuoff[0];
700		endSector = fsuend[0];
701
702		/* find out the column of failed disk being accessed */
703		fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
704
705		/* find out the other failed column not accessed */
706		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
707		for (i = 0; i < numDataCol; i++) {
708			npda.raidAddress = sosAddr + (i * secPerSU);
709			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
710			/* skip over dead disks */
711			if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
712				if (i != fcol[0])
713					break;
714		}
715		RF_ASSERT(i < numDataCol);
716		fcol[1] = i;
717	} else {
718		RF_ASSERT(nresults == 2);
719		pda0 = node->results[0];
720		memset(pda0->bufPtr, 0, bytesPerSector * pda0->numSector);
721		pda1 = node->results[1];
722		memset(pda1->bufPtr, 0, bytesPerSector * pda1->numSector);
723		/* determine the failed column numbers of the two failed
724		 * disks. */
725		fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
726		fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
727		/* determine the offset and end offset of the access within
728		 * each failed stripe unit. */
729		fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
730		fsuend[0] = fsuoff[0] + pda0->numSector;
731		fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
732		fsuend[1] = fsuoff[1] + pda1->numSector;
733		/* determine the startSector to begin decoding */
734		startSector = RF_MIN(pda0->startSector, pda1->startSector);
735		/* determine the endSector to end decoding */
736		endSector = RF_MAX(fsuend[0], fsuend[1]);
737	}
738	/*
739	      assign the beginning sector and the end sector for each parameter
740	      find out the corresponding column # for each parameter
741        */
742	for (prm = 0; prm < ndataParam; prm++) {
743		pda = node->params[prm].p;
744		suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
745		suend[prm] = suoff[prm] + pda->numSector;
746		prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
747	}
748	/* 'sector' is the sector for the current decoding algorithm. For each
749	 * sector in the failed SU, find out the corresponding parameters that
750	 * cover the current sector and that are needed for decoding of this
751	 * sector in failed SU. 2.  Find out if sector is in the shadow of any
752	 * accessed failed SU. If not, malloc a temporary space of a sector in
753	 * size. */
754	for (sector = startSector; sector < endSector; sector++) {
755		if (nresults == 2)
756			if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1]))
757				continue;
758		for (prm = 0; prm < ndataParam; prm++)
759			if (suoff[prm] <= sector && sector < suend[prm])
760				buf[(prmToCol[prm])] = (char *)((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr +
761				    rf_RaidAddressToByte(raidPtr, sector - suoff[prm]);
762		/* find out if sector is in the shadow of any accessed failed
763		 * SU. If yes, assign dest[0], dest[1] to point at suitable
764		 * position of the buffer corresponding to failed SUs. if no,
765		 * malloc a temporary space of a sector in size for
766		 * destination of decoding. */
767		RF_ASSERT(nresults == 1 || nresults == 2);
768		if (nresults == 1) {
769			dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
770			/* Always malloc temp buffer to dest[1]  */
771			dest[1] = RF_Malloc(bytesPerSector);
772			mallc_two = 1;
773		} else {
774			if (fsuoff[0] <= sector && sector < fsuend[0])
775				dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
776			else {
777				dest[0] = RF_Malloc(bytesPerSector);
778				mallc_one = 1;
779			}
780			if (fsuoff[1] <= sector && sector < fsuend[1])
781				dest[1] = (char *)((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]);
782			else {
783				dest[1] = RF_Malloc(bytesPerSector);
784				mallc_two = 1;
785			}
786			RF_ASSERT(mallc_one == 0 || mallc_two == 0);
787		}
788		pbuf = (char *)ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff);
789		ebuf = (char *)epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff);
790		/*
791	         * After finish finding all needed sectors, call doubleEOdecode function for decoding
792	         * one sector to destination.
793	         */
794		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
795		/* free all allocated memory, and mark flag to indicate no
796		 * memory is being allocated */
797		if (mallc_one == 1)
798			RF_Free(dest[0], bytesPerSector);
799		if (mallc_two == 1)
800			RF_Free(dest[1], bytesPerSector);
801		mallc_one = mallc_two = 0;
802	}
803	RF_Free(buf, numDataCol * sizeof(char *));
804	if (ndataParam != 0) {
805		RF_Free(suoff, ndataParam * sizeof(long));
806		RF_Free(suend, ndataParam * sizeof(long));
807		RF_Free(prmToCol, ndataParam * sizeof(long));
808	}
809	RF_ETIMER_STOP(timer);
810	RF_ETIMER_EVAL(timer);
811	if (tracerec) {
812		tracerec->q_us += RF_ETIMER_VAL_US(timer);
813	}
814	rf_GenericWakeupFunc(node, 0);
815}
816
817
818/* currently, only access of one of the two failed SU is allowed in this function.
819 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into
820 * many accesses of single stripe unit.
821 */
822
823void
824rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node)
825{
826	int     np = node->numParams;
827	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
828	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
829	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
830	RF_SectorNum_t sector;
831	RF_RowCol_t col, scol;
832	int     prm, i, j;
833	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
834	unsigned sosAddr;
835	unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
836	RF_int64 numbytes;
837	RF_SectorNum_t startSector, endSector;
838	RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
839	RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
840	char  **buf;		/* buf[0], buf[1], buf[2], ...etc. point to
841				 * buffer storing data read from col0, col1,
842				 * col2 */
843	char   *ebuf, *pbuf, *dest[2], *olddata[2];
844	RF_Etimer_t timer;
845	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
846
847	RF_ASSERT(asmap->numDataFailed == 1);	/* currently only support this
848						 * case, the other failed SU
849						 * is not being accessed */
850	RF_ETIMER_START(timer);
851	buf = RF_Malloc(numDataCol * sizeof(*buf));
852
853	ppda = node->results[0];/* Instead of being buffers, node->results[0]
854				 * and [1] are Ppda and Epda  */
855	epda = node->results[1];
856	fpda = asmap->failedPDAs[0];
857
858	/* First, recovery the failed old SU using EvenOdd double decoding      */
859	/* determine the startSector and endSector for decoding */
860	startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
861	endSector = startSector + fpda->numSector;
862	/* Assign buf[col] pointers to point to each non-failed column  and
863	 * initialize the pbuf and ebuf to point at the beginning of each
864	 * source buffers and destination buffers */
865	for (prm = 0; prm < numDataCol - 2; prm++) {
866		pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
867		col = rf_EUCol(layoutPtr, pda->raidAddress);
868		buf[col] = pda->bufPtr;
869	}
870	/* pbuf and ebuf:  they will change values as double recovery decoding
871	 * goes on */
872	pbuf = ppda->bufPtr;
873	ebuf = epda->bufPtr;
874	/* find out the logical column numbers in the encoding matrix of the
875	 * two failed columns */
876	fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
877
878	/* find out the other failed column not accessed this time */
879	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
880	for (i = 0; i < numDataCol; i++) {
881		npda.raidAddress = sosAddr + (i * secPerSU);
882		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
883		/* skip over dead disks */
884		if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
885			if (i != fcol[0])
886				break;
887	}
888	RF_ASSERT(i < numDataCol);
889	fcol[1] = i;
890	/* assign temporary space to put recovered failed SU */
891	numbytes = fpda->numSector * bytesPerSector;
892	olddata[0] = RF_Malloc(numbytes);
893	olddata[1] = RF_Malloc(numbytes);
894	dest[0] = olddata[0];
895	dest[1] = olddata[1];
896	/* Begin the recovery decoding, initially buf[j],  ebuf, pbuf, dest[j]
897	 * have already pointed at the beginning of each source buffers and
898	 * destination buffers */
899	for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
900		rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
901		for (j = 0; j < numDataCol; j++)
902			if ((j != fcol[0]) && (j != fcol[1]))
903				buf[j] += bytesPerSector;
904		dest[0] += bytesPerSector;
905		dest[1] += bytesPerSector;
906		ebuf += bytesPerSector;
907		pbuf += bytesPerSector;
908	}
909	/* after recovery, the buffer pointed by olddata[0] is the old failed
910	 * data. With new writing data and this old data, use small write to
911	 * calculate the new redundant information */
912	/* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
913	 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
914	 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
915	 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
916	 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
917	 * wudNodes; For current implementation, we assume the simplest case:
918	 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
919	 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
920	 * data to be written to the failed disk. We first bxor the new data
921	 * into the old recovered data, then do the same things as small
922	 * write. */
923
924	rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes);
925	/* do new 'E' calculation  */
926	/* find out the corresponding column in encoding matrix for write
927	 * column to be encoded into redundant disk 'E' */
928	scol = rf_EUCol(layoutPtr, fpda->raidAddress);
929	/* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
930	 * buffer pointer               */
931	rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
932
933	/* do new 'P' calculation  */
934	rf_bxor(olddata[0], ppda->bufPtr, numbytes);
935	/* Free the allocated buffer  */
936	RF_Free(olddata[0], numbytes);
937	RF_Free(olddata[1], numbytes);
938	RF_Free(buf, numDataCol * sizeof(char *));
939
940	RF_ETIMER_STOP(timer);
941	RF_ETIMER_EVAL(timer);
942	if (tracerec) {
943		tracerec->q_us += RF_ETIMER_VAL_US(timer);
944	}
945	rf_GenericWakeupFunc(node, 0);
946}
947#endif				/* RF_INCLUDE_EVENODD > 0 */
948