rf_evenodd_dagfuncs.c revision 1.1
1/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: ChangMing Wu 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/* 30 * Code for RAID-EVENODD architecture. 31 */ 32 33#include "rf_types.h" 34#include "rf_raid.h" 35#include "rf_dag.h" 36#include "rf_dagffrd.h" 37#include "rf_dagffwr.h" 38#include "rf_dagdegrd.h" 39#include "rf_dagdegwr.h" 40#include "rf_dagutils.h" 41#include "rf_dagfuncs.h" 42#include "rf_threadid.h" 43#include "rf_etimer.h" 44#include "rf_general.h" 45#include "rf_configure.h" 46#include "rf_parityscan.h" 47#include "rf_sys.h" 48#include "rf_evenodd.h" 49#include "rf_evenodd_dagfuncs.h" 50 51/* These redundant functions are for small write */ 52RF_RedFuncs_t rf_EOSmallWritePFuncs = { rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P" }; 53RF_RedFuncs_t rf_EOSmallWriteEFuncs = { rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E" }; 54 55/* These redundant functions are for degraded read */ 56RF_RedFuncs_t rf_eoPRecoveryFuncs = { rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"}; 57RF_RedFuncs_t rf_eoERecoveryFuncs = { rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func" }; 58 59/********************************************************************************************** 60 * the following encoding node functions is used in EO_000_CreateLargeWriteDAG 61 **********************************************************************************************/ 62int rf_RegularPEFunc(node) 63 RF_DagNode_t *node; 64{ 65 rf_RegularESubroutine(node,node->results[1]); 66 rf_RegularXorFunc(node); /* does the wakeup here! */ 67#if 1 68 return(0); /* XXX This was missing... GO */ 69#endif 70} 71 72 73/************************************************************************************************ 74 * For EO_001_CreateSmallWriteDAG, there are (i)RegularONEFunc() and (ii)SimpleONEFunc() to 75 * be used. The previous case is when write access at least sectors of full stripe unit. 76 * The later function is used when the write access two stripe units but with total sectors 77 * less than sectors per SU. In this case, the access of parity and 'E' are shown as disconnected 78 * areas in their stripe unit and parity write and 'E' write are both devided into two distinct 79 * writes( totally four). This simple old-new write and regular old-new write happen as in RAID-5 80 ************************************************************************************************/ 81 82/* Algorithm: 83 1. Store the difference of old data and new data in the Rod buffer. 84 2. then encode this buffer into the buffer which already have old 'E' information inside it, 85 the result can be shown to be the new 'E' information. 86 3. xor the Wnd buffer into the difference buffer to recover the original old data. 87 Here we have another alternative: to allocate a temporary buffer for storing the difference of 88 old data and new data, then encode temp buf into old 'E' buf to form new 'E', but this approach 89 take the same speed as the previous, and need more memory. 90*/ 91int rf_RegularONEFunc(node) 92 RF_DagNode_t *node; 93{ 94 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; 95 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; 96 int EpdaIndex = (node->numParams-1)/2 - 1; /* the parameter of node where you can find e-pda */ 97 int i, k, retcode = 0; 98 int suoffset, length; 99 RF_RowCol_t scol; 100 char *srcbuf, *destbuf; 101 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 102 RF_Etimer_t timer; 103 RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p; 104 int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector); /* generally zero */ 105 106 RF_ASSERT( EPDA->type == RF_PDA_TYPE_Q ); 107 RF_ASSERT(ESUOffset == 0); 108 109 RF_ETIMER_START(timer); 110 111 /* Xor the Wnd buffer into Rod buffer, the difference of old data and new data is stored in Rod buffer */ 112 for( k=0; k< EpdaIndex; k += 2) { 113 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector ); 114 retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp); 115 } 116 /* Start to encoding the buffer storing the difference of old data and new data into 'E' buffer */ 117 for (i=0; i<EpdaIndex; i+=2) if (node->params[i+1].p != node->results[0]) { /* results[0] is buf ptr of E */ 118 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 119 srcbuf = (char *) node->params[i+1].p; 120 scol = rf_EUCol(layoutPtr, pda->raidAddress ); 121 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 122 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset); 123 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 124 } 125 /* Recover the original old data to be used by parity encoding function in XorNode */ 126 for( k=0; k< EpdaIndex; k += 2) { 127 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector ); 128 retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp); 129 } 130 RF_ETIMER_STOP(timer); 131 RF_ETIMER_EVAL(timer); 132 tracerec->q_us += RF_ETIMER_VAL_US(timer); 133 rf_GenericWakeupFunc(node, 0); 134#if 1 135 return(0); /* XXX this was missing.. GO */ 136#endif 137} 138 139int rf_SimpleONEFunc(node) 140 RF_DagNode_t *node; 141{ 142 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; 143 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; 144 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 145 int retcode = 0; 146 char *srcbuf, *destbuf; 147 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 148 int length; 149 RF_RowCol_t scol; 150 RF_Etimer_t timer; 151 152 RF_ASSERT( ((RF_PhysDiskAddr_t *)node->params[2].p)->type == RF_PDA_TYPE_Q ); 153 if (node->dagHdr->status == rf_enable) { 154 RF_ETIMER_START(timer); 155 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[4].p)->numSector );/* this is a pda of writeDataNodes */ 156 /* bxor to buffer of readDataNodes */ 157 retcode = rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp); 158 /* find out the corresponding colume in encoding matrix for write colume to be encoded into redundant disk 'E' */ 159 scol = rf_EUCol(layoutPtr, pda->raidAddress ); 160 srcbuf = node->params[1].p; 161 destbuf = node->params[3].p; 162 /* Start encoding process */ 163 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 164 rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp); 165 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer); 166 167 } 168 return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */ 169} 170 171 172/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/ 173void rf_RegularESubroutine(node, ebuf) 174 RF_DagNode_t *node; 175 char *ebuf; 176{ 177 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; 178 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; 179 RF_PhysDiskAddr_t *pda; 180 int i, suoffset; 181 RF_RowCol_t scol; 182 char *srcbuf, *destbuf; 183 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 184 RF_Etimer_t timer; 185 186 RF_ETIMER_START(timer); 187 for (i=0; i<node->numParams-2; i+=2) { 188 RF_ASSERT( node->params[i+1].p != ebuf ); 189 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 190 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 191 scol = rf_EUCol(layoutPtr, pda->raidAddress ); 192 srcbuf = (char *) node->params[i+1].p; 193 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset ); 194 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 195 } 196 RF_ETIMER_STOP(timer); 197 RF_ETIMER_EVAL(timer); 198 tracerec->xor_us += RF_ETIMER_VAL_US(timer); 199} 200 201 202/******************************************************************************************* 203 * Used in EO_001_CreateLargeWriteDAG 204 ******************************************************************************************/ 205int rf_RegularEFunc(node) 206 RF_DagNode_t *node; 207{ 208 rf_RegularESubroutine(node, node->results[0]); 209 rf_GenericWakeupFunc(node, 0); 210#if 1 211 return(0); /* XXX this was missing?.. GO */ 212#endif 213} 214 215/******************************************************************************************* 216 * This degraded function allow only two case: 217 * 1. when write access the full failed stripe unit, then the access can be more than 218 * one tripe units. 219 * 2. when write access only part of the failed SU, we assume accesses of more than 220 * one stripe unit is not allowed so that the write can be dealt with like a 221 * large write. 222 * The following function is based on these assumptions. So except in the second case, 223 * it looks the same as a large write encodeing function. But this is not exactly the 224 * normal way for doing a degraded write, since raidframe have to break cases of access 225 * other than the above two into smaller accesses. We may have to change 226 * DegrESubroutin in the future. 227 *******************************************************************************************/ 228void rf_DegrESubroutine(node, ebuf) 229 RF_DagNode_t *node; 230 char *ebuf; 231{ 232 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; 233 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; 234 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p; 235 RF_PhysDiskAddr_t *pda; 236 int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 237 RF_RowCol_t scol; 238 char *srcbuf, *destbuf; 239 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 240 RF_Etimer_t timer; 241 242 RF_ETIMER_START(timer); 243 for (i=0; i<node->numParams-2; i+=2) { 244 RF_ASSERT( node->params[i+1].p != ebuf ); 245 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 246 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 247 scol = rf_EUCol(layoutPtr, pda->raidAddress ); 248 srcbuf = (char *) node->params[i+1].p; 249 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset-failedSUOffset); 250 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 251 } 252 253 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer); 254} 255 256 257/************************************************************************************** 258 * This function is used in case where one data disk failed and both redundant disks 259 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk 260 * failed in the stripe but not accessed at this time, then we should, instead, use 261 * the rf_EOWriteDoubleRecoveryFunc(). 262 **************************************************************************************/ 263int rf_Degraded_100_EOFunc(node) 264 RF_DagNode_t *node; 265{ 266 rf_DegrESubroutine(node, node->results[1]); 267 rf_RecoveryXorFunc(node); /* does the wakeup here! */ 268#if 1 269 return(0); /* XXX this was missing... SHould these be void functions??? GO */ 270#endif 271} 272 273/************************************************************************************** 274 * This function is to encode one sector in one of the data disks to the E disk. 275 * However, in evenodd this function can also be used as decoding function to recover 276 * data from dead disk in the case of parity failure and a single data failure. 277 **************************************************************************************/ 278void rf_e_EncOneSect( 279 RF_RowCol_t srcLogicCol, 280 char *srcSecbuf, 281 RF_RowCol_t destLogicCol, 282 char *destSecbuf, 283 int bytesPerSector) 284{ 285 int S_index; /* index of the EU in the src col which need be Xored into all EUs in a dest sector */ 286 int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1; 287 RF_RowCol_t j, indexInDest, /* row index of an encoding unit in the destination colume of encoding matrix */ 288 indexInSrc; /* row index of an encoding unit in the source colume used for recovery */ 289 int bytesPerEU = bytesPerSector/numRowInEncMatix; 290 291#if RF_EO_MATRIX_DIM > 17 292 int shortsPerEU = bytesPerEU/sizeof(short); 293 short *destShortBuf, *srcShortBuf1, *srcShortBuf2; 294 register short temp1; 295#elif RF_EO_MATRIX_DIM == 17 296 int longsPerEU = bytesPerEU/sizeof(long); 297 long *destLongBuf, *srcLongBuf1, *srcLongBuf2; 298 register long temp1; 299#endif 300 301#if RF_EO_MATRIX_DIM > 17 302 RF_ASSERT( sizeof(short) == 2 || sizeof(short) == 1 ); 303 RF_ASSERT( bytesPerEU % sizeof(short) == 0 ); 304#elif RF_EO_MATRIX_DIM == 17 305 RF_ASSERT( sizeof(long) == 8 || sizeof(long) == 4 ); 306 RF_ASSERT( bytesPerEU % sizeof(long) == 0); 307#endif 308 309 S_index = rf_EO_Mod( ( RF_EO_MATRIX_DIM -1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM); 310#if RF_EO_MATRIX_DIM > 17 311 srcShortBuf1 = (short *)(srcSecbuf + S_index * bytesPerEU); 312#elif RF_EO_MATRIX_DIM == 17 313 srcLongBuf1 = (long *)(srcSecbuf + S_index * bytesPerEU); 314#endif 315 316 for( indexInDest = 0; indexInDest < numRowInEncMatix ; indexInDest++){ 317 indexInSrc = rf_EO_Mod( (indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM ); 318 319#if RF_EO_MATRIX_DIM > 17 320 destShortBuf = (short *)(destSecbuf + indexInDest * bytesPerEU); 321 srcShortBuf2 = (short *)(srcSecbuf + indexInSrc * bytesPerEU); 322 for(j=0; j < shortsPerEU; j++) { 323 temp1 = destShortBuf[j]^srcShortBuf1[j]; 324 /* note: S_index won't be at the end row for any src col! */ 325 if(indexInSrc != RF_EO_MATRIX_DIM -1) destShortBuf[j] = (srcShortBuf2[j])^temp1; 326 /* if indexInSrc is at the end row, ie. RF_EO_MATRIX_DIM -1, then all elements are zero! */ 327 else destShortBuf[j] = temp1; 328 } 329 330#elif RF_EO_MATRIX_DIM == 17 331 destLongBuf = (long *)(destSecbuf + indexInDest * bytesPerEU); 332 srcLongBuf2 = (long *)(srcSecbuf + indexInSrc * bytesPerEU); 333 for(j=0; j < longsPerEU; j++) { 334 temp1 = destLongBuf[j]^srcLongBuf1[j]; 335 if(indexInSrc != RF_EO_MATRIX_DIM -1) destLongBuf[j] = (srcLongBuf2[j])^temp1; 336 else destLongBuf[j] = temp1; 337 } 338#endif 339 } 340} 341 342void rf_e_encToBuf( 343 RF_Raid_t *raidPtr, 344 RF_RowCol_t srcLogicCol, 345 char *srcbuf, 346 RF_RowCol_t destLogicCol, 347 char *destbuf, 348 int numSector) 349{ 350 int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 351 352 for (i=0; i < numSector; i++) 353 { 354 rf_e_EncOneSect( srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector); 355 srcbuf += bytesPerSector; 356 destbuf += bytesPerSector; 357 } 358} 359 360/************************************************************************************** 361 * when parity die and one data die, We use second redundant information, 'E', 362 * to recover the data in dead disk. This function is used in the recovery node of 363 * for EO_110_CreateReadDAG 364 **************************************************************************************/ 365int rf_RecoveryEFunc(node) 366 RF_DagNode_t *node; 367{ 368 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; 369 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; 370 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p; 371 RF_RowCol_t scol, /*source logical column*/ 372 fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress ); /* logical column of failed SU */ 373 int i; 374 RF_PhysDiskAddr_t *pda; 375 int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector); 376 char *srcbuf, *destbuf; 377 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 378 RF_Etimer_t timer; 379 380 bzero( (char *)node->results[0], rf_RaidAddressToByte(raidPtr,failedPDA->numSector)); 381 if (node->dagHdr->status == rf_enable) { 382 RF_ETIMER_START(timer); 383 for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) { 384 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 385 if( i == node->numParams - 4 ) scol = RF_EO_MATRIX_DIM - 2; /* the colume of redundant E */ 386 else scol = rf_EUCol(layoutPtr, pda->raidAddress ); 387 srcbuf = (char *) node->params[i+1].p; 388 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 389 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset); 390 rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector); 391 } 392 RF_ETIMER_STOP(timer); 393 RF_ETIMER_EVAL(timer); 394 tracerec->xor_us += RF_ETIMER_VAL_US(timer); 395 } 396 return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */ 397} 398 399/************************************************************************************** 400 * This function is used in the case where one data and the parity have filed. 401 * (in EO_110_CreateWriteDAG ) 402 **************************************************************************************/ 403int rf_EO_DegradedWriteEFunc(RF_DagNode_t *node) 404{ 405 rf_DegrESubroutine(node, node->results[0]); 406 rf_GenericWakeupFunc(node, 0); 407#if 1 408 return(0); /* XXX Yet another one!! GO */ 409#endif 410} 411 412 413 414/************************************************************************************** 415 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES 416 **************************************************************************************/ 417 418void rf_doubleEOdecode( 419 RF_Raid_t *raidPtr, 420 char **rrdbuf, 421 char **dest, 422 RF_RowCol_t *fcol, 423 char *pbuf, 424 char *ebuf) 425{ 426 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); 427 int i, j, k, f1, f2, row; 428 int rrdrow, erow, count = 0; 429 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 ); 430 int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1; 431#if 0 432 int pcol = (RF_EO_MATRIX_DIM) - 1; 433#endif 434 int ecol = (RF_EO_MATRIX_DIM) - 2; 435 int bytesPerEU = bytesPerSector/numRowInEncMatix; 436 int numDataCol = layoutPtr->numDataCol; 437#if RF_EO_MATRIX_DIM > 17 438 int shortsPerEU = bytesPerEU/sizeof(short); 439 short *rrdbuf_current, *pbuf_current, *ebuf_current; 440 short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; 441 register short *temp; 442 short *P; 443 444 RF_ASSERT( bytesPerEU % sizeof(short) == 0); 445 RF_Malloc(P, bytesPerEU, (short *)); 446 RF_Malloc(temp, bytesPerEU, (short *)); 447#elif RF_EO_MATRIX_DIM == 17 448 int longsPerEU = bytesPerEU/sizeof(long); 449 long *rrdbuf_current, *pbuf_current, *ebuf_current; 450 long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; 451 register long *temp; 452 long *P; 453 454 RF_ASSERT( bytesPerEU % sizeof(long) == 0); 455 RF_Malloc(P, bytesPerEU, (long *)); 456 RF_Malloc(temp, bytesPerEU, (long *)); 457#endif 458 RF_ASSERT( *((long *)dest[0]) == 0); 459 RF_ASSERT( *((long *)dest[1]) == 0); 460 bzero((char *)P, bytesPerEU); 461 bzero((char *)temp, bytesPerEU); 462 RF_ASSERT( *P == 0 ); 463 /* calculate the 'P' parameter, which, not parity, is the Xor of all elements in 464 the last two column, ie. 'E' and 'parity' colume, see the Ref. paper by Blaum, et al 1993 */ 465 for( i=0; i< numRowInEncMatix; i++) 466 for( k=0; k< longsPerEU; k++) { 467#if RF_EO_MATRIX_DIM > 17 468 ebuf_current = ((short *)ebuf) + i*shortsPerEU + k; 469 pbuf_current = ((short *)pbuf) + i*shortsPerEU + k; 470#elif RF_EO_MATRIX_DIM == 17 471 ebuf_current = ((long *)ebuf) + i*longsPerEU + k; 472 pbuf_current = ((long *)pbuf) + i*longsPerEU + k; 473#endif 474 P[k] ^= *ebuf_current; 475 P[k] ^= *pbuf_current; 476 } 477 RF_ASSERT( fcol[0] != fcol[1] ); 478 if( fcol[0] < fcol[1] ) { 479#if RF_EO_MATRIX_DIM > 17 480 dest_smaller = (short *)(dest[0]); 481 dest_larger = (short *)(dest[1]); 482#elif RF_EO_MATRIX_DIM == 17 483 dest_smaller = (long *)(dest[0]); 484 dest_larger = (long *)(dest[1]); 485#endif 486 f1 = fcol[0]; 487 f2 = fcol[1]; 488 } 489 else { 490#if RF_EO_MATRIX_DIM > 17 491 dest_smaller = (short *)(dest[1]); 492 dest_larger = (short *)(dest[0]); 493#elif RF_EO_MATRIX_DIM == 17 494 dest_smaller = (long *)(dest[1]); 495 dest_larger = (long *)(dest[0]); 496#endif 497 f1 = fcol[1]; 498 f2 = fcol[0]; 499 } 500 row = (RF_EO_MATRIX_DIM) -1; 501 while( (row = rf_EO_Mod( (row+f1-f2), RF_EO_MATRIX_DIM )) != ( (RF_EO_MATRIX_DIM) -1) ) 502 { 503#if RF_EO_MATRIX_DIM > 17 504 dest_larger_current = dest_larger + row*shortsPerEU; 505 dest_smaller_current = dest_smaller + row*shortsPerEU; 506#elif RF_EO_MATRIX_DIM == 17 507 dest_larger_current = dest_larger + row*longsPerEU; 508 dest_smaller_current = dest_smaller + row*longsPerEU; 509#endif 510 /** Do the diagonal recovery. Initially, temp[k] = (failed 1), 511 which is the failed data in the colume which has smaller col index. **/ 512 /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */ 513 for( j=0; j< numDataCol; j++) 514 { 515 if( j == f1 || j == f2 ) continue; 516 rrdrow = rf_EO_Mod( (row+f2-j), RF_EO_MATRIX_DIM ); 517 if ( rrdrow != (RF_EO_MATRIX_DIM) -1 ) { 518#if RF_EO_MATRIX_DIM > 17 519 rrdbuf_current = (short *)(rrdbuf[j]) + rrdrow * shortsPerEU; 520 for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current + k); 521#elif RF_EO_MATRIX_DIM == 17 522 rrdbuf_current = (long *)(rrdbuf[j]) + rrdrow * longsPerEU; 523 for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current + k); 524#endif 525 } 526 } 527 /* step 2: ^E(erow,m-2), If erow is at the buttom row, don't Xor into it 528 E(erow,m-2) = (principle diagonal) ^ (failed 1) ^ (failed 2) 529 ^ ( SUM of nonfailed in-diagonal A(rrdrow,0..m-3) ) 530 After this step, temp[k] = (principle diagonal) ^ (failed 2) */ 531 532 erow = rf_EO_Mod( (row+f2-ecol), (RF_EO_MATRIX_DIM) ); 533 if ( erow != (RF_EO_MATRIX_DIM) -1) { 534#if RF_EO_MATRIX_DIM > 17 535 ebuf_current = (short *)ebuf + shortsPerEU * erow; 536 for (k=0; k< shortsPerEU; k++) temp[k] ^= *(ebuf_current+k); 537#elif RF_EO_MATRIX_DIM == 17 538 ebuf_current = (long *)ebuf + longsPerEU * erow; 539 for (k=0; k< longsPerEU; k++) temp[k] ^= *(ebuf_current+k); 540#endif 541 } 542 /* step 3: ^P to obtain the failed data (failed 2). 543 P can be proved to be actually (principle diagonal) 544 After this step, temp[k] = (failed 2), the failed data to be recovered */ 545#if RF_EO_MATRIX_DIM > 17 546 for (k=0; k< shortsPerEU; k++) temp[k] ^= P[k]; 547 /* Put the data to the destination buffer */ 548 for (k=0; k< shortsPerEU; k++) dest_larger_current[k] = temp[k]; 549#elif RF_EO_MATRIX_DIM == 17 550 for (k=0; k< longsPerEU; k++) temp[k] ^= P[k]; 551 /* Put the data to the destination buffer */ 552 for (k=0; k< longsPerEU; k++) dest_larger_current[k] = temp[k]; 553#endif 554 555 /** THE FOLLOWING DO THE HORIZONTAL XOR **/ 556 /* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data columes */ 557 for (j=0; j< numDataCol; j++) 558 { 559 if( j == f1 || j == f2 ) continue; 560#if RF_EO_MATRIX_DIM > 17 561 rrdbuf_current = (short *)(rrdbuf[j]) + row * shortsPerEU; 562 for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current+k); 563#elif RF_EO_MATRIX_DIM == 17 564 rrdbuf_current = (long *)(rrdbuf[j]) + row * longsPerEU; 565 for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current+k); 566#endif 567 } 568 /* step 2: ^A(row,m-1) */ 569 /* step 3: Put the data to the destination buffer */ 570#if RF_EO_MATRIX_DIM > 17 571 pbuf_current = (short *)pbuf + shortsPerEU * row; 572 for (k=0; k< shortsPerEU; k++) temp[k] ^= *(pbuf_current+k); 573 for (k=0; k< shortsPerEU; k++) dest_smaller_current[k] = temp[k]; 574#elif RF_EO_MATRIX_DIM == 17 575 pbuf_current = (long *)pbuf + longsPerEU * row; 576 for (k=0; k< longsPerEU; k++) temp[k] ^= *(pbuf_current+k); 577 for (k=0; k< longsPerEU; k++) dest_smaller_current[k] = temp[k]; 578#endif 579 count++; 580 } 581 /* Check if all Encoding Unit in the data buffer have been decoded, 582 according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number, 583 this algorithm will covered all buffer */ 584 RF_ASSERT( count == numRowInEncMatix ); 585 RF_Free((char *)P, bytesPerEU); 586 RF_Free((char *)temp, bytesPerEU); 587} 588 589 590/*************************************************************************************** 591* This function is called by double degragded read 592* EO_200_CreateReadDAG 593* 594***************************************************************************************/ 595int rf_EvenOddDoubleRecoveryFunc(node) 596 RF_DagNode_t *node; 597{ 598 int ndataParam = 0; 599 int np = node->numParams; 600 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p; 601 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p; 602 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); 603 int i, prm, sector, nresults = node->numResults; 604 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 605 unsigned sosAddr; 606 int two = 0, mallc_one= 0, mallc_two = 0; /* flags to indicate if memory is allocated */ 607 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 ); 608 RF_PhysDiskAddr_t *ppda,*ppda2,*epda,*epda2,*pda, *pda0, *pda1, npda; 609 RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol; 610 char **buf, *ebuf, *pbuf, *dest[2]; 611 long *suoff=NULL, *suend=NULL, *prmToCol=NULL, psuoff, esuoff; 612 RF_SectorNum_t startSector, endSector; 613 RF_Etimer_t timer; 614 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 615 616 RF_ETIMER_START(timer); 617 618 /* Find out the number of parameters which are pdas for data information */ 619 for (i = 0; i<= np; i++) 620 if( ((RF_PhysDiskAddr_t *)node->params[i].p)->type != RF_PDA_TYPE_DATA) {ndataParam = i ; break; } 621 622 RF_Malloc(buf, numDataCol*sizeof(char *), (char **)); 623 if (ndataParam != 0 ){ 624 RF_Malloc(suoff, ndataParam*sizeof(long), (long *) ); 625 RF_Malloc(suend, ndataParam*sizeof(long), (long *) ); 626 RF_Malloc(prmToCol, ndataParam*sizeof(long), (long *) ); 627 } 628 629 if (asmap->failedPDAs[1] && 630 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { 631 RF_ASSERT(0); /* currently, no support for this situation */ 632 ppda = node->params[np-6].p; 633 ppda2 = node->params[np-5].p; 634 RF_ASSERT( ppda2->type == RF_PDA_TYPE_PARITY ); 635 epda = node->params[np-4].p; 636 epda2 = node->params[np-3].p; 637 RF_ASSERT( epda2->type == RF_PDA_TYPE_Q ); 638 two = 1; 639 } 640 else { 641 ppda = node->params[np-4].p; 642 epda = node->params[np-3].p; 643 psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector); 644 esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector); 645 RF_ASSERT( psuoff == esuoff ); 646 } 647 /* 648 the followings have three goals: 649 1. determine the startSector to begin decoding and endSector to end decoding. 650 2. determine the colume numbers of the two failed disks. 651 3. determine the offset and end offset of the access within each failed stripe unit. 652 */ 653 if( nresults == 1 ) { 654 /* find the startSector to begin decoding */ 655 pda = node->results[0]; 656 bzero(pda->bufPtr, bytesPerSector*pda->numSector ); 657 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector ); 658 fsuend[0] = fsuoff[0] + pda->numSector; 659 startSector = fsuoff[0]; 660 endSector = fsuend[0]; 661 662 /* find out the the column of failed disk being accessed */ 663 fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress ); 664 665 /* find out the other failed colume not accessed */ 666 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 667 for (i=0; i < numDataCol; i++) { 668 npda.raidAddress = sosAddr + (i * secPerSU); 669 (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); 670 /* skip over dead disks */ 671 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) 672 if (i != fcol[0]) break; 673 } 674 RF_ASSERT (i < numDataCol); 675 fcol[1] = i; 676 } 677 else { 678 RF_ASSERT ( nresults == 2 ); 679 pda0 = node->results[0]; bzero(pda0->bufPtr, bytesPerSector*pda0->numSector ); 680 pda1 = node->results[1]; bzero(pda1->bufPtr, bytesPerSector*pda1->numSector ); 681 /* determine the failed colume numbers of the two failed disks. */ 682 fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress ); 683 fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress ); 684 /* determine the offset and end offset of the access within each failed stripe unit. */ 685 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector ); 686 fsuend[0] = fsuoff[0] + pda0->numSector; 687 fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector ); 688 fsuend[1] = fsuoff[1] + pda1->numSector; 689 /* determine the startSector to begin decoding */ 690 startSector = RF_MIN( pda0->startSector, pda1->startSector ); 691 /* determine the endSector to end decoding */ 692 endSector = RF_MAX( fsuend[0], fsuend[1] ); 693 } 694 /* 695 assign the beginning sector and the end sector for each parameter 696 find out the corresponding colume # for each parameter 697 */ 698 for( prm=0; prm < ndataParam; prm++ ) { 699 pda = node->params[prm].p; 700 suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector); 701 suend[prm] = suoff[prm] + pda->numSector; 702 prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress ); 703 } 704 /* 'sector' is the sector for the current decoding algorithm. For each sector in the failed SU, 705 find out the corresponding parameters that cover the current sector and that are needed for 706 decoding of this sector in failed SU. 2. Find out if sector is in the shadow of any accessed 707 failed SU. If not, malloc a temporary space of a sector in size. 708 */ 709 for( sector = startSector; sector < endSector; sector++ ){ 710 if ( nresults == 2 ) 711 if( !(fsuoff[0]<=sector && sector<fsuend[0]) && !(fsuoff[1]<=sector && sector<fsuend[1]) )continue; 712 for( prm=0; prm < ndataParam; prm++ ) 713 if( suoff[prm] <= sector && sector < suend[prm] ) 714 buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *)node->params[prm].p)->bufPtr + 715 rf_RaidAddressToByte(raidPtr, sector-suoff[prm]); 716 /* find out if sector is in the shadow of any accessed failed SU. If yes, assign dest[0], dest[1] to point 717 at suitable position of the buffer corresponding to failed SUs. if no, malloc a temporary space of 718 a sector in size for destination of decoding. 719 */ 720 RF_ASSERT( nresults == 1 || nresults == 2 ); 721 if ( nresults == 1) { 722 dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]); 723 /* Always malloc temp buffer to dest[1] */ 724 RF_Malloc( dest[1], bytesPerSector, (char *) ); 725 bzero(dest[1],bytesPerSector); mallc_two = 1; } 726 else { 727 if( fsuoff[0] <= sector && sector < fsuend[0] ) 728 dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]); 729 else { RF_Malloc( dest[0], bytesPerSector, (char *) ); 730 bzero(dest[0],bytesPerSector); mallc_one = 1; } 731 if( fsuoff[1] <= sector && sector < fsuend[1] ) 732 dest[1] = ((RF_PhysDiskAddr_t *)node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[1]); 733 else { RF_Malloc( dest[1], bytesPerSector, (char *) ); 734 bzero(dest[1],bytesPerSector); mallc_two = 1; } 735 RF_ASSERT( mallc_one == 0 || mallc_two == 0 ); 736 } 737 pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-psuoff ); 738 ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-esuoff ); 739 /* 740 * After finish finding all needed sectors, call doubleEOdecode function for decoding 741 * one sector to destination. 742 */ 743 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf ); 744 /* free all allocated memory, and mark flag to indicate no memory is being allocated */ 745 if( mallc_one == 1) RF_Free( dest[0], bytesPerSector ); 746 if( mallc_two == 1) RF_Free( dest[1], bytesPerSector ); 747 mallc_one = mallc_two = 0; 748 } 749 RF_Free(buf, numDataCol*sizeof(char *)); 750 if (ndataParam != 0){ 751 RF_Free(suoff, ndataParam*sizeof(long)); 752 RF_Free(suend, ndataParam*sizeof(long)); 753 RF_Free(prmToCol, ndataParam*sizeof(long)); 754 } 755 756 RF_ETIMER_STOP(timer); 757 RF_ETIMER_EVAL(timer); 758 if (tracerec) { 759 tracerec->q_us += RF_ETIMER_VAL_US(timer); 760 } 761 rf_GenericWakeupFunc(node,0); 762#if 1 763 return(0); /* XXX is this even close!!?!?!!? GO */ 764#endif 765} 766 767 768/* currently, only access of one of the two failed SU is allowed in this function. 769 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into 770 * many accesses of single stripe unit. 771 */ 772 773int rf_EOWriteDoubleRecoveryFunc(node) 774 RF_DagNode_t *node; 775{ 776 int np = node->numParams; 777 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p; 778 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p; 779 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); 780 RF_SectorNum_t sector; 781 RF_RowCol_t col, scol; 782 int prm, i, j; 783 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 784 unsigned sosAddr; 785 unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 ); 786 RF_int64 numbytes; 787 RF_SectorNum_t startSector, endSector; 788 RF_PhysDiskAddr_t *ppda,*epda,*pda, *fpda, npda; 789 RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol; 790 char **buf; /* buf[0], buf[1], buf[2], ...etc. point to buffer storing data read from col0, col1, col2 */ 791 char *ebuf, *pbuf, *dest[2], *olddata[2]; 792 RF_Etimer_t timer; 793 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 794 795 RF_ASSERT( asmap->numDataFailed == 1 ); /* currently only support this case, the other failed SU is not being accessed */ 796 RF_ETIMER_START(timer); 797 RF_Malloc(buf, numDataCol*sizeof(char *), (char **)); 798 799 ppda = node->results[0]; /* Instead of being buffers, node->results[0] and [1] are Ppda and Epda */ 800 epda = node->results[1]; 801 fpda = asmap->failedPDAs[0]; 802 803 /* First, recovery the failed old SU using EvenOdd double decoding */ 804 /* determine the startSector and endSector for decoding */ 805 startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector ); 806 endSector = startSector + fpda->numSector; 807 /* Assign buf[col] pointers to point to each non-failed colume and initialize the pbuf 808 and ebuf to point at the beginning of each source buffers and destination buffers */ 809 for( prm=0; prm < numDataCol-2; prm++ ) { 810 pda = (RF_PhysDiskAddr_t *)node->params[prm].p; 811 col = rf_EUCol(layoutPtr, pda->raidAddress ); 812 buf[col] = pda->bufPtr; 813 } 814 /* pbuf and ebuf: they will change values as double recovery decoding goes on */ 815 pbuf = ppda->bufPtr; 816 ebuf = epda->bufPtr; 817 /* find out the logical colume numbers in the encoding matrix of the two failed columes */ 818 fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress ); 819 820 /* find out the other failed colume not accessed this time */ 821 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 822 for (i=0; i < numDataCol; i++) { 823 npda.raidAddress = sosAddr + (i * secPerSU); 824 (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); 825 /* skip over dead disks */ 826 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) 827 if (i != fcol[0]) break; 828 } 829 RF_ASSERT (i < numDataCol); 830 fcol[1] = i; 831 /* assign temporary space to put recovered failed SU */ 832 numbytes = fpda->numSector * bytesPerSector; 833 RF_Malloc(olddata[0], numbytes, (char *) ); 834 RF_Malloc(olddata[1], numbytes, (char *) ); 835 dest[0] = olddata[0]; 836 dest[1] = olddata[1]; 837 bzero(olddata[0], numbytes); 838 bzero(olddata[1], numbytes); 839 /* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j] have already 840 pointed at the beginning of each source buffers and destination buffers */ 841 for( sector = startSector, i=0; sector < endSector; sector++ , i++){ 842 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf ); 843 for (j=0; j < numDataCol; j++) 844 if( ( j != fcol[0]) && ( j != fcol[1] ) ) buf[j] += bytesPerSector; 845 dest[0] += bytesPerSector; 846 dest[1] += bytesPerSector; 847 ebuf += bytesPerSector; 848 pbuf += bytesPerSector; 849 } 850 /* after recovery, the buffer pointed by olddata[0] is the old failed data. 851 With new writing data and this old data, use small write to calculate 852 the new redundant informations 853 */ 854 /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of Rrd; 855 params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; 856 params[ PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] 857 are Pdas of wudNodes; 858 For current implementation, we assume the simplest case: 859 asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1 ie. PDAPerDisk = 1 860 then node->params[numDataCol] must be the new data to be writen to the failed disk. We first bxor the new data 861 into the old recovered data, then do the same things as small write. 862 */ 863 864 rf_bxor( ((RF_PhysDiskAddr_t *)node->params[numDataCol].p)->bufPtr, olddata[0], numbytes, node->dagHdr->bp); 865 /* do new 'E' calculation */ 866 /* find out the corresponding colume in encoding matrix for write colume to be encoded into redundant disk 'E' */ 867 scol = rf_EUCol(layoutPtr, fpda->raidAddress ); 868 /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest buffer pointer */ 869 rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector); 870 871 /* do new 'P' calculation */ 872 rf_bxor( olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp); 873 /* Free the allocated buffer */ 874 RF_Free( olddata[0], numbytes ); 875 RF_Free( olddata[1], numbytes ); 876 RF_Free( buf, numDataCol*sizeof(char *)); 877 878 RF_ETIMER_STOP(timer); 879 RF_ETIMER_EVAL(timer); 880 if (tracerec) { 881 tracerec->q_us += RF_ETIMER_VAL_US(timer); 882 } 883 884 rf_GenericWakeupFunc(node,0); 885 return(0); 886} 887