rf_evenodd_dagfuncs.c revision 1.1
1/*	$NetBSD: rf_evenodd_dagfuncs.c,v 1.1 1998/11/13 04:20:29 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: ChangMing Wu
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * Code for RAID-EVENODD  architecture.
31 */
32
33#include "rf_types.h"
34#include "rf_raid.h"
35#include "rf_dag.h"
36#include "rf_dagffrd.h"
37#include "rf_dagffwr.h"
38#include "rf_dagdegrd.h"
39#include "rf_dagdegwr.h"
40#include "rf_dagutils.h"
41#include "rf_dagfuncs.h"
42#include "rf_threadid.h"
43#include "rf_etimer.h"
44#include "rf_general.h"
45#include "rf_configure.h"
46#include "rf_parityscan.h"
47#include "rf_sys.h"
48#include "rf_evenodd.h"
49#include "rf_evenodd_dagfuncs.h"
50
51/* These redundant functions are for small write */
52RF_RedFuncs_t rf_EOSmallWritePFuncs = { rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P" };
53RF_RedFuncs_t rf_EOSmallWriteEFuncs = { rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E" };
54
55/* These redundant functions are for degraded read */
56RF_RedFuncs_t rf_eoPRecoveryFuncs =  { rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
57RF_RedFuncs_t rf_eoERecoveryFuncs = { rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func" };
58
/**********************************************************************************************
 *   The following encoding node function is used in EO_000_CreateLargeWriteDAG
 **********************************************************************************************/
62int rf_RegularPEFunc(node)
63  RF_DagNode_t  *node;
64{
65   rf_RegularESubroutine(node,node->results[1]);
66   rf_RegularXorFunc(node);    /* does the wakeup here! */
67#if 1
68   return(0); /* XXX This was missing... GO */
69#endif
70}
71
72
/************************************************************************************************
 *  For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and (ii) SimpleONEFunc() to
 *  be used. The former is used when the write accesses at least a full stripe unit's worth of
 *  sectors. The latter is used when the write accesses two stripe units but the total number of
 *  sectors is less than the sectors per SU. In this case, the accesses to parity and 'E' appear
 *  as disconnected areas in their stripe units, and the parity write and the 'E' write are each
 *  divided into two distinct writes (four in total). The simple old-new write and the regular
 *  old-new write happen as in RAID-5.
 ************************************************************************************************/
81
/* Algorithm:
     1. Store the difference of old data and new data in the Rod buffer.
     2. Then encode this buffer into the buffer which already has the old 'E' information inside it;
	the result can be shown to be the new 'E' information.
     3. Xor the Wnd buffer into the difference buffer to recover the original old data.
   Here we have another alternative: allocate a temporary buffer for storing the difference of
   old data and new data, then encode the temp buf into the old 'E' buf to form the new 'E'. But that
   approach runs at the same speed as the one above and needs more memory.
*/
91int rf_RegularONEFunc(node)
92  RF_DagNode_t  *node;
93{
94  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
95  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
96  int EpdaIndex = (node->numParams-1)/2 - 1; /* the parameter of node where you can find e-pda */
97  int i, k, retcode = 0;
98  int suoffset, length;
99  RF_RowCol_t scol;
100  char *srcbuf, *destbuf;
101  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
102  RF_Etimer_t timer;
103  RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
104  int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector); /* generally zero  */
105
106  RF_ASSERT( EPDA->type == RF_PDA_TYPE_Q );
107  RF_ASSERT(ESUOffset == 0);
108
109  RF_ETIMER_START(timer);
110
111  /* Xor the Wnd buffer into Rod buffer, the difference of old data and new data is stored in Rod buffer */
112  for( k=0; k< EpdaIndex; k += 2) {
113   length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector );
114   retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp);
115  }
116  /* Start to encoding the buffer storing the difference of old data and new data into 'E' buffer  */
117  for (i=0; i<EpdaIndex; i+=2) if (node->params[i+1].p != node->results[0]) { /* results[0] is buf ptr of E */
118    pda = (RF_PhysDiskAddr_t *) node->params[i].p;
119    srcbuf = (char *) node->params[i+1].p;
120    scol = rf_EUCol(layoutPtr, pda->raidAddress );
121    suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
122    destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset);
123    rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
124  }
125  /* Recover the original old data to be used by parity encoding function in XorNode */
126  for( k=0; k< EpdaIndex; k += 2) {
127   length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector );
128   retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp);
129  }
130  RF_ETIMER_STOP(timer);
131  RF_ETIMER_EVAL(timer);
132  tracerec->q_us += RF_ETIMER_VAL_US(timer);
133  rf_GenericWakeupFunc(node, 0);
134#if 1
135  return(0); /* XXX this was missing.. GO */
136#endif
137}
138
139int rf_SimpleONEFunc(node)
140  RF_DagNode_t   *node;
141{
142  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
143  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
144  RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
145  int retcode = 0;
146  char *srcbuf, *destbuf;
147  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
148  int length;
149  RF_RowCol_t scol;
150  RF_Etimer_t timer;
151
152  RF_ASSERT( ((RF_PhysDiskAddr_t *)node->params[2].p)->type == RF_PDA_TYPE_Q );
153  if (node->dagHdr->status == rf_enable) {
154     RF_ETIMER_START(timer);
155     length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[4].p)->numSector );/* this is a pda of writeDataNodes */
156     /* bxor to buffer of readDataNodes */
157     retcode = rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
158     /* find out the corresponding colume in encoding matrix for write colume to be encoded into redundant disk 'E' */
159     scol = rf_EUCol(layoutPtr, pda->raidAddress );
160     srcbuf = node->params[1].p;
161     destbuf = node->params[3].p;
162     /* Start encoding process */
163     rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
164     rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
165     RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer);
166
167  }
168  return(rf_GenericWakeupFunc(node, retcode));     /* call wake func explicitly since no I/O in this node */
169}
170
171
172/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write  ********/
173void rf_RegularESubroutine(node, ebuf)
174  RF_DagNode_t  *node;
175  char          *ebuf;
176{
177  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
178  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
179  RF_PhysDiskAddr_t *pda;
180  int i, suoffset;
181  RF_RowCol_t scol;
182  char *srcbuf, *destbuf;
183  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
184  RF_Etimer_t timer;
185
186  RF_ETIMER_START(timer);
187  for (i=0; i<node->numParams-2; i+=2) {
188    RF_ASSERT( node->params[i+1].p != ebuf );
189    pda = (RF_PhysDiskAddr_t *) node->params[i].p;
190    suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
191    scol = rf_EUCol(layoutPtr, pda->raidAddress );
192    srcbuf = (char *) node->params[i+1].p;
193    destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset );
194    rf_e_encToBuf(raidPtr,  scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
195  }
196  RF_ETIMER_STOP(timer);
197  RF_ETIMER_EVAL(timer);
198  tracerec->xor_us += RF_ETIMER_VAL_US(timer);
199}
200
201
202/*******************************************************************************************
203 *			 Used in  EO_001_CreateLargeWriteDAG
204 ******************************************************************************************/
205int rf_RegularEFunc(node)
206  RF_DagNode_t  *node;
207{
208   rf_RegularESubroutine(node, node->results[0]);
209   rf_GenericWakeupFunc(node, 0);
210#if 1
211   return(0); /* XXX this was missing?.. GO */
212#endif
213}
214
/*******************************************************************************************
 * This degraded function allows only two cases:
 *  1. When the write accesses the full failed stripe unit, the access can span more than
 *     one stripe unit.
 *  2. When the write accesses only part of the failed SU, we assume accesses of more than
 *     one stripe unit are not allowed, so that the write can be dealt with like a
 *     large write.
 *  The following function is based on these assumptions. So except in the second case,
 *  it looks the same as a large write encoding function. But this is not exactly the
 *  normal way of doing a degraded write, since RAIDframe has to break accesses other
 *  than the above two cases into smaller accesses. We may have to change
 *  DegrESubroutine in the future.
 *******************************************************************************************/
228void rf_DegrESubroutine(node, ebuf)
229  RF_DagNode_t  *node;
230  char          *ebuf;
231{
232  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
233  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
234  RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
235  RF_PhysDiskAddr_t *pda;
236  int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
237  RF_RowCol_t scol;
238  char *srcbuf, *destbuf;
239  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
240  RF_Etimer_t timer;
241
242  RF_ETIMER_START(timer);
243  for (i=0; i<node->numParams-2; i+=2) {
244    RF_ASSERT( node->params[i+1].p != ebuf );
245    pda = (RF_PhysDiskAddr_t *) node->params[i].p;
246    suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
247    scol = rf_EUCol(layoutPtr, pda->raidAddress );
248    srcbuf = (char *) node->params[i+1].p;
249    destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset-failedSUOffset);
250    rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
251  }
252
253  RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer);
254}
255
256
/**************************************************************************************
 * This function is used in the case where one data disk has failed and both redundant
 * disks are alive. It is used in the EO_100_CreateWriteDAG. Note: if another disk in
 * the stripe has failed but is not accessed at this time, then we should, instead, use
 * the rf_EOWriteDoubleRecoveryFunc().
 **************************************************************************************/
263int rf_Degraded_100_EOFunc(node)
264  RF_DagNode_t  *node;
265{
266  rf_DegrESubroutine(node, node->results[1]);
267  rf_RecoveryXorFunc(node);  /* does the wakeup here! */
268#if 1
269  return(0); /* XXX this was missing... SHould these be void functions??? GO */
270#endif
271}
272
273/**************************************************************************************
274 * This function is to encode one sector in one of the data disks to the E disk.
275 * However, in evenodd this function can also be used as decoding function to recover
276 * data from dead disk in the case of parity failure and a single data failure.
277 **************************************************************************************/
278void rf_e_EncOneSect(
279  RF_RowCol_t   srcLogicCol,
280  char         *srcSecbuf,
281  RF_RowCol_t   destLogicCol,
282  char         *destSecbuf,
283  int           bytesPerSector)
284{
285  int S_index;  /* index of the EU in the src col which need be Xored into all EUs in a dest sector */
286  int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1;
287  RF_RowCol_t j, indexInDest,   /* row index of an encoding unit in the destination colume of encoding matrix */
288              indexInSrc;  /* row index of an encoding unit in the source colume used for recovery */
289  int bytesPerEU = bytesPerSector/numRowInEncMatix;
290
291#if RF_EO_MATRIX_DIM > 17
292  int shortsPerEU = bytesPerEU/sizeof(short);
293  short *destShortBuf, *srcShortBuf1, *srcShortBuf2;
294  register short temp1;
295#elif RF_EO_MATRIX_DIM == 17
296  int longsPerEU = bytesPerEU/sizeof(long);
297  long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
298  register long temp1;
299#endif
300
301#if RF_EO_MATRIX_DIM > 17
302  RF_ASSERT( sizeof(short) == 2 || sizeof(short) == 1 );
303  RF_ASSERT( bytesPerEU % sizeof(short) == 0 );
304#elif RF_EO_MATRIX_DIM == 17
305  RF_ASSERT( sizeof(long) == 8 || sizeof(long) == 4 );
306  RF_ASSERT( bytesPerEU % sizeof(long) == 0);
307#endif
308
309  S_index = rf_EO_Mod( ( RF_EO_MATRIX_DIM -1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
310#if RF_EO_MATRIX_DIM > 17
311  srcShortBuf1 = (short *)(srcSecbuf + S_index * bytesPerEU);
312#elif RF_EO_MATRIX_DIM == 17
313  srcLongBuf1 = (long *)(srcSecbuf + S_index * bytesPerEU);
314#endif
315
316  for( indexInDest = 0; indexInDest < numRowInEncMatix ; indexInDest++){
317     indexInSrc = rf_EO_Mod( (indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM );
318
319#if RF_EO_MATRIX_DIM > 17
320     destShortBuf = (short *)(destSecbuf + indexInDest * bytesPerEU);
321     srcShortBuf2 = (short *)(srcSecbuf + indexInSrc * bytesPerEU);
322     for(j=0; j < shortsPerEU; j++) {
323        temp1 = destShortBuf[j]^srcShortBuf1[j];
324        /* note: S_index won't be at the end row for any src col! */
325        if(indexInSrc != RF_EO_MATRIX_DIM -1) destShortBuf[j] = (srcShortBuf2[j])^temp1;
326        /* if indexInSrc is at the end row, ie. RF_EO_MATRIX_DIM -1, then all elements are zero! */
327	else destShortBuf[j] = temp1;
328     }
329
330#elif RF_EO_MATRIX_DIM == 17
331     destLongBuf = (long *)(destSecbuf + indexInDest * bytesPerEU);
332     srcLongBuf2 = (long *)(srcSecbuf + indexInSrc * bytesPerEU);
333     for(j=0; j < longsPerEU; j++) {
334        temp1 = destLongBuf[j]^srcLongBuf1[j];
335        if(indexInSrc != RF_EO_MATRIX_DIM -1) destLongBuf[j] = (srcLongBuf2[j])^temp1;
336        else destLongBuf[j] = temp1;
337     }
338#endif
339  }
340}
341
342void rf_e_encToBuf(
343  RF_Raid_t    *raidPtr,
344  RF_RowCol_t   srcLogicCol,
345  char         *srcbuf,
346  RF_RowCol_t   destLogicCol,
347  char         *destbuf,
348  int           numSector)
349{
350  int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
351
352  for (i=0; i < numSector; i++)
353  {
354     rf_e_EncOneSect( srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
355     srcbuf += bytesPerSector;
356     destbuf += bytesPerSector;
357  }
358}
359
/**************************************************************************************
 * When parity and one data disk have failed, we use the second redundant information, 'E',
 * to recover the data on the dead disk. This function is used in the recovery node
 * for EO_110_CreateReadDAG
 **************************************************************************************/
365int rf_RecoveryEFunc(node)
366  RF_DagNode_t  *node;
367{
368  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
369  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
370  RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
371  RF_RowCol_t scol, /*source logical column*/
372              fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress ); /* logical column of failed SU */
373  int i;
374  RF_PhysDiskAddr_t *pda;
375  int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
376  char *srcbuf, *destbuf;
377  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
378  RF_Etimer_t timer;
379
380  bzero( (char *)node->results[0], rf_RaidAddressToByte(raidPtr,failedPDA->numSector));
381  if (node->dagHdr->status == rf_enable) {
382    RF_ETIMER_START(timer);
383    for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) {
384      pda = (RF_PhysDiskAddr_t *) node->params[i].p;
385      if( i == node->numParams - 4 ) scol = RF_EO_MATRIX_DIM - 2; /* the colume of redundant E */
386      else scol = rf_EUCol(layoutPtr, pda->raidAddress );
387      srcbuf = (char *) node->params[i+1].p;
388      suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
389      destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
390      rf_e_encToBuf(raidPtr,  scol, srcbuf, fcol, destbuf, pda->numSector);
391    }
392    RF_ETIMER_STOP(timer);
393    RF_ETIMER_EVAL(timer);
394    tracerec->xor_us += RF_ETIMER_VAL_US(timer);
395  }
396  return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */
397}
398
/**************************************************************************************
 * This function is used in the case where one data disk and the parity have failed.
 * (in EO_110_CreateWriteDAG )
 **************************************************************************************/
403int rf_EO_DegradedWriteEFunc(RF_DagNode_t *node)
404{
405  rf_DegrESubroutine(node, node->results[0]);
406  rf_GenericWakeupFunc(node, 0);
407#if 1
408  return(0); /* XXX Yet another one!! GO */
409#endif
410}
411
412
413
414/**************************************************************************************
415 *  		THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
416 **************************************************************************************/
417
418void rf_doubleEOdecode(
419  RF_Raid_t     *raidPtr,
420  char         **rrdbuf,
421  char         **dest,
422  RF_RowCol_t   *fcol,
423  char          *pbuf,
424  char          *ebuf)
425{
426  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
427  int i, j, k, f1, f2, row;
428  int rrdrow, erow, count = 0;
429  int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 );
430  int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1;
431#if 0
432  int pcol = (RF_EO_MATRIX_DIM) - 1;
433#endif
434  int ecol = (RF_EO_MATRIX_DIM) - 2;
435  int bytesPerEU = bytesPerSector/numRowInEncMatix;
436  int numDataCol  = layoutPtr->numDataCol;
437#if RF_EO_MATRIX_DIM > 17
438  int shortsPerEU = bytesPerEU/sizeof(short);
439  short *rrdbuf_current, *pbuf_current, *ebuf_current;
440  short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
441  register short *temp;
442  short *P;
443
444  RF_ASSERT( bytesPerEU % sizeof(short) == 0);
445  RF_Malloc(P, bytesPerEU, (short *));
446  RF_Malloc(temp, bytesPerEU, (short *));
447#elif RF_EO_MATRIX_DIM == 17
448  int longsPerEU = bytesPerEU/sizeof(long);
449  long *rrdbuf_current, *pbuf_current, *ebuf_current;
450  long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
451  register long *temp;
452  long *P;
453
454  RF_ASSERT( bytesPerEU % sizeof(long) == 0);
455  RF_Malloc(P, bytesPerEU, (long *));
456  RF_Malloc(temp, bytesPerEU, (long *));
457#endif
458  RF_ASSERT( *((long *)dest[0]) == 0);
459  RF_ASSERT( *((long *)dest[1]) == 0);
460  bzero((char *)P, bytesPerEU);
461  bzero((char *)temp, bytesPerEU);
462  RF_ASSERT( *P == 0 );
463  /* calculate the 'P' parameter, which, not parity, is the Xor of all elements in
464     the last two column, ie. 'E' and 'parity' colume, see the Ref. paper by Blaum, et al 1993  */
465  for( i=0; i< numRowInEncMatix; i++)
466       for( k=0; k< longsPerEU; k++) {
467#if RF_EO_MATRIX_DIM > 17
468            ebuf_current = ((short *)ebuf) + i*shortsPerEU + k;
469            pbuf_current = ((short *)pbuf) + i*shortsPerEU + k;
470#elif RF_EO_MATRIX_DIM == 17
471            ebuf_current = ((long *)ebuf) + i*longsPerEU + k;
472            pbuf_current = ((long *)pbuf) + i*longsPerEU + k;
473#endif
474            P[k] ^= *ebuf_current;
475            P[k] ^= *pbuf_current;
476       }
477  RF_ASSERT( fcol[0] != fcol[1] );
478  if( fcol[0] < fcol[1] ) {
479#if RF_EO_MATRIX_DIM > 17
480        dest_smaller = (short *)(dest[0]);
481        dest_larger = (short *)(dest[1]);
482#elif RF_EO_MATRIX_DIM == 17
483	dest_smaller = (long *)(dest[0]);
484	dest_larger = (long *)(dest[1]);
485#endif
486	f1 = fcol[0];
487	f2 = fcol[1];
488  }
489  else {
490#if RF_EO_MATRIX_DIM > 17
491        dest_smaller = (short *)(dest[1]);
492        dest_larger = (short *)(dest[0]);
493#elif RF_EO_MATRIX_DIM == 17
494	dest_smaller = (long *)(dest[1]);
495	dest_larger = (long *)(dest[0]);
496#endif
497	f1 = fcol[1];
498	f2 = fcol[0];
499  }
500  row = (RF_EO_MATRIX_DIM) -1;
501  while( (row = rf_EO_Mod( (row+f1-f2), RF_EO_MATRIX_DIM )) != ( (RF_EO_MATRIX_DIM) -1) )
502  {
503#if RF_EO_MATRIX_DIM > 17
504       dest_larger_current = dest_larger + row*shortsPerEU;
505       dest_smaller_current = dest_smaller + row*shortsPerEU;
506#elif RF_EO_MATRIX_DIM == 17
507       dest_larger_current = dest_larger + row*longsPerEU;
508       dest_smaller_current = dest_smaller + row*longsPerEU;
509#endif
510       /**    Do the diagonal recovery. Initially, temp[k] = (failed 1),
511	      which is the failed data in the colume which has smaller col index. **/
512       /*   step 1:  ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3))         */
513       for( j=0; j< numDataCol; j++)
514       {
515             if( j == f1 || j == f2 ) continue;
516             rrdrow = rf_EO_Mod( (row+f2-j), RF_EO_MATRIX_DIM );
517	     if ( rrdrow != (RF_EO_MATRIX_DIM) -1 ) {
518#if RF_EO_MATRIX_DIM > 17
519                 rrdbuf_current = (short *)(rrdbuf[j]) + rrdrow * shortsPerEU;
520                 for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current + k);
521#elif RF_EO_MATRIX_DIM == 17
522	         rrdbuf_current = (long *)(rrdbuf[j]) + rrdrow * longsPerEU;
523                 for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current + k);
524#endif
525	     }
526       }
527       /*   step 2:  ^E(erow,m-2), If erow is at the buttom row, don't Xor into it
528	    E(erow,m-2) = (principle diagonal) ^ (failed 1) ^ (failed 2)
529                        ^ ( SUM of nonfailed in-diagonal A(rrdrow,0..m-3) )
530            After this step, temp[k] = (principle diagonal) ^ (failed 2)       */
531
532       erow = rf_EO_Mod( (row+f2-ecol), (RF_EO_MATRIX_DIM) );
533       if ( erow != (RF_EO_MATRIX_DIM) -1) {
534#if RF_EO_MATRIX_DIM > 17
535           ebuf_current = (short *)ebuf + shortsPerEU * erow;
536           for (k=0; k< shortsPerEU; k++) temp[k] ^= *(ebuf_current+k);
537#elif RF_EO_MATRIX_DIM == 17
538           ebuf_current = (long *)ebuf + longsPerEU * erow;
539           for (k=0; k< longsPerEU; k++) temp[k] ^= *(ebuf_current+k);
540#endif
541       }
542       /*   step 3: ^P to obtain the failed data (failed 2).
543	    P can be proved to be actually  (principle diagonal)
544            After this step, temp[k] = (failed 2), the failed data to be recovered */
545#if RF_EO_MATRIX_DIM > 17
546       for (k=0; k< shortsPerEU; k++) temp[k] ^= P[k];
547       /*   Put the data to the destination buffer                              */
548       for (k=0; k< shortsPerEU; k++) dest_larger_current[k] = temp[k];
549#elif RF_EO_MATRIX_DIM == 17
550       for (k=0; k< longsPerEU; k++) temp[k] ^= P[k];
551       /*   Put the data to the destination buffer                              */
552       for (k=0; k< longsPerEU; k++) dest_larger_current[k] = temp[k];
553#endif
554
555       /**          THE FOLLOWING DO THE HORIZONTAL XOR                **/
556       /*   step 1:  ^(SUM of A(row,0..m-3)), ie. all nonfailed data columes    */
557       for (j=0; j< numDataCol; j++)
558       {
559             if( j == f1 || j == f2 ) continue;
560#if RF_EO_MATRIX_DIM > 17
561             rrdbuf_current = (short *)(rrdbuf[j]) + row * shortsPerEU;
562             for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current+k);
563#elif RF_EO_MATRIX_DIM == 17
564	     rrdbuf_current = (long *)(rrdbuf[j]) + row * longsPerEU;
565             for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current+k);
566#endif
567       }
568       /*   step 2: ^A(row,m-1) */
569       /*   step 3: Put the data to the destination buffer                             	*/
570#if RF_EO_MATRIX_DIM > 17
571       pbuf_current = (short *)pbuf + shortsPerEU * row;
572       for (k=0; k< shortsPerEU; k++) temp[k] ^= *(pbuf_current+k);
573       for (k=0; k< shortsPerEU; k++) dest_smaller_current[k] = temp[k];
574#elif RF_EO_MATRIX_DIM == 17
575       pbuf_current = (long *)pbuf + longsPerEU * row;
576       for (k=0; k< longsPerEU; k++) temp[k] ^= *(pbuf_current+k);
577       for (k=0; k< longsPerEU; k++) dest_smaller_current[k] = temp[k];
578#endif
579       count++;
580  }
581  /*        Check if all Encoding Unit in the data buffer have been decoded,
582	    according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
583	    this algorithm will covered all buffer 				 */
584  RF_ASSERT( count == numRowInEncMatix );
585  RF_Free((char *)P, bytesPerEU);
586  RF_Free((char *)temp, bytesPerEU);
587}
588
589
/***************************************************************************************
* 	This function is called by double degraded read
* 	EO_200_CreateReadDAG
*
***************************************************************************************/
595int rf_EvenOddDoubleRecoveryFunc(node)
596  RF_DagNode_t  *node;
597{
598  int ndataParam = 0;
599  int np = node->numParams;
600  RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
601  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
602  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
603  int i, prm, sector, nresults = node->numResults;
604  RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
605  unsigned sosAddr;
606  int two = 0, mallc_one= 0, mallc_two = 0;    /* flags to indicate if memory is allocated */
607  int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 );
608  RF_PhysDiskAddr_t *ppda,*ppda2,*epda,*epda2,*pda, *pda0, *pda1, npda;
609  RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
610  char **buf, *ebuf, *pbuf, *dest[2];
611  long *suoff=NULL, *suend=NULL, *prmToCol=NULL, psuoff, esuoff;
612  RF_SectorNum_t startSector, endSector;
613  RF_Etimer_t timer;
614  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
615
616  RF_ETIMER_START(timer);
617
618  /* Find out the number of parameters which are pdas for data information */
619  for (i = 0; i<= np; i++)
620     if( ((RF_PhysDiskAddr_t *)node->params[i].p)->type != RF_PDA_TYPE_DATA) {ndataParam = i ; break; }
621
622  RF_Malloc(buf, numDataCol*sizeof(char *), (char **));
623  if (ndataParam != 0 ){
624      RF_Malloc(suoff, ndataParam*sizeof(long), (long *) );
625      RF_Malloc(suend, ndataParam*sizeof(long), (long *) );
626      RF_Malloc(prmToCol, ndataParam*sizeof(long), (long *) );
627  }
628
629  if (asmap->failedPDAs[1] &&
630      (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
631      RF_ASSERT(0); /* currently, no support for this situation */
632      ppda  = node->params[np-6].p;
633      ppda2 = node->params[np-5].p;
634      RF_ASSERT( ppda2->type == RF_PDA_TYPE_PARITY );
635      epda  = node->params[np-4].p;
636      epda2 = node->params[np-3].p;
637      RF_ASSERT( epda2->type == RF_PDA_TYPE_Q );
638      two = 1;
639  }
640  else {
641      ppda = node->params[np-4].p;
642      epda = node->params[np-3].p;
643      psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
644      esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
645      RF_ASSERT( psuoff == esuoff );
646  }
647  /*
648      the followings have three goals:
649      1. determine the startSector to begin decoding and endSector to end decoding.
650      2. determine the colume numbers of the two failed disks.
651      3. determine the offset and end offset of the access within each failed stripe unit.
652   */
653  if( nresults == 1 ) {
654      /* find the startSector to begin decoding */
655      pda = node->results[0];
656      bzero(pda->bufPtr, bytesPerSector*pda->numSector );
657      fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector );
658      fsuend[0] = fsuoff[0] + pda->numSector;
659      startSector = fsuoff[0];
660      endSector = fsuend[0];
661
662      /* find out the the column of failed disk being accessed */
663      fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress );
664
665      /* find out the other failed colume not accessed */
666      sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
667      for (i=0; i < numDataCol; i++) {
668          npda.raidAddress = sosAddr + (i * secPerSU);
669          (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
670          /* skip over dead disks */
671          if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
672              if (i != fcol[0]) break;
673      }
674      RF_ASSERT (i < numDataCol);
675      fcol[1] = i;
676  }
677  else {
678      RF_ASSERT (  nresults == 2 );
679      pda0 = node->results[0];  bzero(pda0->bufPtr, bytesPerSector*pda0->numSector );
680      pda1 = node->results[1];  bzero(pda1->bufPtr, bytesPerSector*pda1->numSector );
681      /* determine the failed colume numbers of the two failed disks. */
682      fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress );
683      fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress );
684      /*  determine the offset and end offset of the access within each failed stripe unit. */
685      fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector );
686      fsuend[0] = fsuoff[0] + pda0->numSector;
687      fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector );
688      fsuend[1] = fsuoff[1] + pda1->numSector;
689      /*  determine the startSector to begin decoding */
690      startSector = RF_MIN( pda0->startSector, pda1->startSector );
691      /*  determine the endSector to end decoding */
692      endSector = RF_MAX( fsuend[0], fsuend[1] );
693  }
694  /*
695	assign the beginning sector and the end sector for each parameter
696	find out the corresponding colume # for each parameter
697  */
698  for( prm=0; prm < ndataParam; prm++ ) {
699      pda = node->params[prm].p;
700      suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
701      suend[prm] = suoff[prm] + pda->numSector;
702      prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress );
703  }
704  /* 'sector' is the sector for the current decoding algorithm. For each sector in the failed SU,
705     find out the corresponding parameters that cover the current sector and that are needed for
706     decoding of this sector in failed SU. 2.  Find out if sector is in the shadow of any accessed
707     failed SU. If not, malloc a temporary space of a sector in size.
708  */
709  for( sector = startSector; sector < endSector; sector++ ){
710     if ( nresults == 2 )
711	  if( !(fsuoff[0]<=sector && sector<fsuend[0]) && !(fsuoff[1]<=sector && sector<fsuend[1]) )continue;
712     for( prm=0; prm < ndataParam; prm++ )
713          if( suoff[prm] <= sector && sector < suend[prm] )
714               buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *)node->params[prm].p)->bufPtr +
715					rf_RaidAddressToByte(raidPtr, sector-suoff[prm]);
716     /* find out if sector is in the shadow of any accessed failed SU. If yes, assign dest[0], dest[1] to point
717	 at suitable position of the buffer corresponding to failed SUs. if no, malloc a temporary space of
718	 a sector in size for destination of decoding.
719      */
720     RF_ASSERT( nresults == 1 || nresults == 2 );
721     if ( nresults == 1) {
722           dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]);
723           /* Always malloc temp buffer to dest[1]  */
724           RF_Malloc( dest[1], bytesPerSector, (char *) );
725	   bzero(dest[1],bytesPerSector); mallc_two = 1; }
726      else {
727           if( fsuoff[0] <= sector && sector < fsuend[0] )
728                  dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]);
729           else { RF_Malloc( dest[0], bytesPerSector, (char *) );
730		  bzero(dest[0],bytesPerSector); mallc_one = 1; }
731           if( fsuoff[1] <= sector && sector < fsuend[1] )
732                  dest[1] = ((RF_PhysDiskAddr_t *)node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[1]);
733           else { RF_Malloc( dest[1], bytesPerSector, (char *) );
734                  bzero(dest[1],bytesPerSector); mallc_two = 1; }
735           RF_ASSERT( mallc_one == 0 || mallc_two == 0 );
736      }
737      pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-psuoff );
738      ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-esuoff );
739      /*
740       * After finish finding all needed sectors, call doubleEOdecode function for decoding
741       * one sector to destination.
742       */
743      rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf );
744      /* free all allocated memory, and mark flag to indicate no memory is being allocated */
745      if( mallc_one == 1) RF_Free( dest[0], bytesPerSector );
746      if( mallc_two == 1) RF_Free( dest[1], bytesPerSector );
747      mallc_one = mallc_two = 0;
748  }
749  RF_Free(buf, numDataCol*sizeof(char *));
750  if (ndataParam != 0){
751      RF_Free(suoff, ndataParam*sizeof(long));
752      RF_Free(suend, ndataParam*sizeof(long));
753      RF_Free(prmToCol, ndataParam*sizeof(long));
754  }
755
756  RF_ETIMER_STOP(timer);
757  RF_ETIMER_EVAL(timer);
758  if (tracerec) {
759    tracerec->q_us += RF_ETIMER_VAL_US(timer);
760  }
761  rf_GenericWakeupFunc(node,0);
762#if 1
763  return(0); /* XXX is this even close!!?!?!!? GO */
764#endif
765}
766
767
768/* Currently, this function only supports an access that touches one of the two failed SUs.
769 * Also, asmap->numStripeUnitsAccessed is limited to one; RAIDframe will break a large access into
770 * many accesses of a single stripe unit each.
771 */
772
/*
 * DAG node function for a write to a failed data stripe unit (SU) while a
 * second data column of the same stripe is also on a dead disk.
 *
 * Steps performed, in order:
 *   1. Rebuild the old contents of the accessed failed SU
 *      (asmap->failedPDAs[0]) one sector at a time with rf_doubleEOdecode(),
 *      passing it the surviving data buffers and the P and E buffers.
 *   2. XOR the new write data into the rebuilt old data with rf_bxor().
 *   3. Apply that result to the E buffer via rf_e_encToBuf() and to the
 *      P buffer via rf_bxor(), the same way a small write does.
 *
 * Inputs taken from the node:
 *   params[0 .. numDataCol-3]  PDAs whose buffers hold the surviving data columns
 *   params[numDataCol]         PDA whose buffer holds the new data
 *   params[np-2]               RF_Raid_t pointer
 *   params[np-1]               RF_AccessStripeMap_t pointer
 *   results[0], results[1]     PDAs for P and E (not plain buffers)
 *
 * Always returns 0, after calling rf_GenericWakeupFunc(node, 0).
 */
773int rf_EOWriteDoubleRecoveryFunc(node)
774  RF_DagNode_t  *node;
775{
776  int np = node->numParams;
  /* the last two params carry the access stripe map and the raid descriptor */
777  RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
778  RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
779  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
780  RF_SectorNum_t sector;
781  RF_RowCol_t col, scol;
782  int prm, i, j;
783  RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
784  unsigned sosAddr;
  /* size in bytes of a single sector, as reported by rf_RaidAddressToByte() */
785  unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 );
786  RF_int64 numbytes;
787  RF_SectorNum_t startSector, endSector;
788  RF_PhysDiskAddr_t *ppda,*epda,*pda, *fpda, npda;
789  RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
790  char **buf; /* buf[c] points at the buffer holding the data read from data column c */
791  char *ebuf, *pbuf, *dest[2], *olddata[2];
792  RF_Etimer_t timer;
793  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
794
795  RF_ASSERT( asmap->numDataFailed == 1 ); /* only this case is supported: the other failed SU is not part of the access */
796  RF_ETIMER_START(timer);
797  RF_Malloc(buf, numDataCol*sizeof(char *), (char **));
798
799  ppda = node->results[0];            /* node->results[0] and [1] are the P and E PDAs here, not plain buffers */
800  epda = node->results[1];
801  fpda = asmap->failedPDAs[0];
802
803  /* First, recover the old contents of the failed SU using EvenOdd double decoding */
804  /* determine the startSector and endSector for decoding */
805  startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector );
806  endSector = startSector + fpda->numSector;
807  /* Point buf[col] at the buffer of each non-failed column (one param per surviving
808     data column, hence numDataCol-2 of them); the two failed columns get no entry here */
809  for( prm=0; prm < numDataCol-2; prm++ ) {
810      pda = (RF_PhysDiskAddr_t *)node->params[prm].p;
811      col = rf_EUCol(layoutPtr, pda->raidAddress );
812      buf[col] = pda->bufPtr;
813  }
814  /*  pbuf and ebuf are cursors into the P and E buffers; they advance as the decoding goes on */
815  pbuf = ppda->bufPtr;
816  ebuf = epda->bufPtr;
817  /* find out the logical column numbers, in the encoding matrix, of the two failed columns */
818  fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress );
819
820  /* find out the other failed column, the one not accessed this time */
  /* NOTE(review): sosAddr is a plain unsigned; if raid addresses can be wider than
     unsigned, the stripe-boundary address would be truncated here -- confirm */
821  sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
  /* walk the data columns of this stripe, mapping the first sector of each SU to its (row, col) */
822  for (i=0; i < numDataCol; i++) {
823      npda.raidAddress = sosAddr + (i * secPerSU);
824      (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
825      /* stop at the first dead disk whose column index is not the accessed failed column */
826      if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
827      if (i != fcol[0]) break;
828  }
  /* the loop must have stopped early; i is then the index of the second failed column */
829  RF_ASSERT (i < numDataCol);
830  fcol[1] = i;
831  /* allocate temporary space to hold the recovered contents of both failed SUs */
832  numbytes = fpda->numSector * bytesPerSector;
833  RF_Malloc(olddata[0], numbytes, (char *) );
834  RF_Malloc(olddata[1], numbytes, (char *) );
  /* dest[] are moving cursors; olddata[] keep the buffer bases for later use and for RF_Free */
835  dest[0] = olddata[0];
836  dest[1] = olddata[1];
837  bzero(olddata[0], numbytes);
838  bzero(olddata[1], numbytes);
839  /* Begin the recovery decoding. Initially buf[j], ebuf, pbuf and dest[j] already
840     point at the beginning of their source and destination buffers. */
  /* one sector is decoded per iteration; sector and i serve only as iteration counters */
841  for( sector = startSector, i=0; sector < endSector; sector++ , i++){
842      rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf );
      /* step every surviving-column cursor, both destinations, and the E and P cursors by one sector */
843      for (j=0; j < numDataCol; j++)
844           if( ( j != fcol[0]) && ( j != fcol[1] ) ) buf[j] += bytesPerSector;
845      dest[0] += bytesPerSector;
846      dest[1] += bytesPerSector;
847      ebuf += bytesPerSector;
848      pbuf += bytesPerSector;
849  }
850  /* After recovery, the buffer pointed to by olddata[0] holds the old failed data.
851     With the new write data and this old data, use the small-write method to
852     calculate the new redundancy information.
853   */
854  /*  node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of Rrd;
855            params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ;
856            params[ PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1]
857                    are Pdas of wudNodes;
858      For the current implementation, we assume the simplest case:
859           asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1, i.e. PDAPerDisk = 1;
860      then node->params[numDataCol] must be the new data to be written to the failed disk. We first bxor the new data
861      into the old recovered data, then do the same things as a small write.
862   */
863
864  rf_bxor( ((RF_PhysDiskAddr_t *)node->params[numDataCol].p)->bufPtr, olddata[0], numbytes, node->dagHdr->bp);
865  /*  do the new 'E' calculation  */
866  /*  find the column in the encoding matrix of the written column that is to be encoded into redundant disk 'E' */
867  scol = rf_EUCol(layoutPtr, fpda->raidAddress );
868  /*  olddata[0] is now the source buffer pointer; epda->bufPtr is the destination buffer pointer */
869  rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
870
871  /*  do the new 'P' calculation: bxor olddata[0] into the P buffer  */
872   rf_bxor( olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp);
873  /* Free the allocated buffers (the olddata[] bases, not the advanced dest[] cursors) */
874  RF_Free( olddata[0], numbytes );
875  RF_Free( olddata[1], numbytes );
876  RF_Free( buf, numDataCol*sizeof(char *));
877
  /* add the time spent in this function to the trace record, when one is attached */
878  RF_ETIMER_STOP(timer);
879  RF_ETIMER_EVAL(timer);
880  if (tracerec) {
881    tracerec->q_us += RF_ETIMER_VAL_US(timer);
882  }
883
884  rf_GenericWakeupFunc(node,0);
885  return(0);
886}
887