rf_evenodd_dagfuncs.c revision 1.23
1/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.23 2019/02/09 03:34:00 christos Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: ChangMing Wu 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/* 30 * Code for RAID-EVENODD architecture. 
31 */ 32 33#include <sys/cdefs.h> 34__KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.23 2019/02/09 03:34:00 christos Exp $"); 35 36#include "rf_archs.h" 37 38#ifdef _KERNEL_OPT 39#include "opt_raid_diagnostic.h" 40#endif 41 42#if RF_INCLUDE_EVENODD > 0 43 44#include <dev/raidframe/raidframevar.h> 45 46#include "rf_raid.h" 47#include "rf_dag.h" 48#include "rf_dagffrd.h" 49#include "rf_dagffwr.h" 50#include "rf_dagdegrd.h" 51#include "rf_dagdegwr.h" 52#include "rf_dagutils.h" 53#include "rf_dagfuncs.h" 54#include "rf_etimer.h" 55#include "rf_general.h" 56#include "rf_parityscan.h" 57#include "rf_evenodd.h" 58#include "rf_evenodd_dagfuncs.h" 59 60/* These redundant functions are for small write */ 61RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"}; 62RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"}; 63/* These redundant functions are for degraded read */ 64RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"}; 65RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"}; 66/********************************************************************************************** 67 * the following encoding node functions is used in EO_000_CreateLargeWriteDAG 68 **********************************************************************************************/ 69int 70rf_RegularPEFunc(RF_DagNode_t *node) 71{ 72 rf_RegularESubroutine(node, node->results[1]); 73 rf_RegularXorFunc(node);/* does the wakeup here! */ 74#if 1 75 return (0); /* XXX This was missing... GO */ 76#endif 77} 78 79 80/************************************************************************************************ 81 * For EO_001_CreateSmallWriteDAG, there are (i)RegularONEFunc() and (ii)SimpleONEFunc() to 82 * be used. 
The former is used when the write accesses at least the sectors of a full stripe unit.
 * The latter function is used when the write accesses two stripe units but with total sectors
 * less than sectors per SU. In this case, the accesses of parity and 'E' are shown as disconnected
 * areas in their stripe unit, and the parity write and the 'E' write are both divided into two
 * distinct writes (four in total). This simple old-new write and regular old-new write happen as in RAID-5.
 ************************************************************************************************/

/* Algorithm:
   1. Store the difference of old data and new data in the Rod buffer.
   2. Then encode this buffer into the buffer which already has the old 'E' information inside it;
      the result can be shown to be the new 'E' information.
   3. Xor the Wnd buffer into the difference buffer to recover the original old data.
   Here we have another alternative: allocate a temporary buffer for storing the difference of
   old data and new data, then encode the temp buf into the old 'E' buf to form the new 'E'. But this
   approach takes the same time as the previous one and needs more memory.
97*/ 98int 99rf_RegularONEFunc(RF_DagNode_t *node) 100{ 101 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 102 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 103 int EpdaIndex = (node->numParams - 1) / 2 - 1; /* the parameter of node 104 * where you can find 105 * e-pda */ 106 int i, k; 107 int suoffset, length; 108 RF_RowCol_t scol; 109 char *srcbuf, *destbuf; 110 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 111 RF_Etimer_t timer; 112 RF_PhysDiskAddr_t *pda; 113#ifdef RAID_DIAGNOSTIC 114 RF_PhysDiskAddr_t *EPDA = 115 (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p; 116 int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector); 117 118 RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q); 119 RF_ASSERT(ESUOffset == 0); 120#endif /* RAID_DIAGNOSTIC */ 121 122 RF_ETIMER_START(timer); 123 124 /* Xor the Wnd buffer into Rod buffer, the difference of old data and 125 * new data is stored in Rod buffer */ 126 for (k = 0; k < EpdaIndex; k += 2) { 127 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector); 128 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length); 129 } 130 /* Start to encoding the buffer storing the difference of old data and 131 * new data into 'E' buffer */ 132 for (i = 0; i < EpdaIndex; i += 2) 133 if (node->params[i + 1].p != node->results[0]) { /* results[0] is buf ptr 134 * of E */ 135 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 136 srcbuf = (char *) node->params[i + 1].p; 137 scol = rf_EUCol(layoutPtr, pda->raidAddress); 138 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 139 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset); 140 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 141 } 142 /* Recover the original old data to be used by parity encoding 143 * function in XorNode */ 144 for (k = 0; k < EpdaIndex; k += 2) { 145 length = rf_RaidAddressToByte(raidPtr, 
((RF_PhysDiskAddr_t *) node->params[k].p)->numSector); 146 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length); 147 } 148 RF_ETIMER_STOP(timer); 149 RF_ETIMER_EVAL(timer); 150 tracerec->q_us += RF_ETIMER_VAL_US(timer); 151 rf_GenericWakeupFunc(node, 0); 152#if 1 153 return (0); /* XXX this was missing.. GO */ 154#endif 155} 156 157int 158rf_SimpleONEFunc(RF_DagNode_t *node) 159{ 160 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 161 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 162 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 163 int retcode = 0; 164 char *srcbuf, *destbuf; 165 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 166 int length; 167 RF_RowCol_t scol; 168 RF_Etimer_t timer; 169 170 RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q); 171 if (node->dagHdr->status == rf_enable) { 172 RF_ETIMER_START(timer); 173 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector); /* this is a pda of 174 * writeDataNodes */ 175 /* bxor to buffer of readDataNodes */ 176 retcode = rf_bxor(node->params[5].p, node->params[1].p, length); 177 /* find out the corresponding colume in encoding matrix for 178 * write colume to be encoded into redundant disk 'E' */ 179 scol = rf_EUCol(layoutPtr, pda->raidAddress); 180 srcbuf = node->params[1].p; 181 destbuf = node->params[3].p; 182 /* Start encoding process */ 183 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 184 rf_bxor(node->params[5].p, node->params[1].p, length); 185 RF_ETIMER_STOP(timer); 186 RF_ETIMER_EVAL(timer); 187 tracerec->q_us += RF_ETIMER_VAL_US(timer); 188 189 } 190 return (rf_GenericWakeupFunc(node, retcode)); /* call wake func 191 * explicitly since no 192 * I/O in this node */ 193} 194 195 196/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. 
large write ********/ 197void 198rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf) 199{ 200 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 201 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 202 RF_PhysDiskAddr_t *pda; 203 int i, suoffset; 204 RF_RowCol_t scol; 205 char *srcbuf, *destbuf; 206 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 207 RF_Etimer_t timer; 208 209 RF_ETIMER_START(timer); 210 for (i = 0; i < node->numParams - 2; i += 2) { 211 RF_ASSERT(node->params[i + 1].p != ebuf); 212 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 213 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 214 scol = rf_EUCol(layoutPtr, pda->raidAddress); 215 srcbuf = (char *) node->params[i + 1].p; 216 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset); 217 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 218 } 219 RF_ETIMER_STOP(timer); 220 RF_ETIMER_EVAL(timer); 221 tracerec->xor_us += RF_ETIMER_VAL_US(timer); 222} 223 224 225/******************************************************************************************* 226 * Used in EO_001_CreateLargeWriteDAG 227 ******************************************************************************************/ 228int 229rf_RegularEFunc(RF_DagNode_t *node) 230{ 231 rf_RegularESubroutine(node, node->results[0]); 232 rf_GenericWakeupFunc(node, 0); 233#if 1 234 return (0); /* XXX this was missing?.. GO */ 235#endif 236} 237/******************************************************************************************* 238 * This degraded function allow only two case: 239 * 1. when write access the full failed stripe unit, then the access can be more than 240 * one tripe units. 241 * 2. when write access only part of the failed SU, we assume accesses of more than 242 * one stripe unit is not allowed so that the write can be dealt with like a 243 * large write. 244 * The following function is based on these assumptions. 
So except in the second case, 245 * it looks the same as a large write encodeing function. But this is not exactly the 246 * normal way for doing a degraded write, since raidframe have to break cases of access 247 * other than the above two into smaller accesses. We may have to change 248 * DegrESubroutin in the future. 249 *******************************************************************************************/ 250void 251rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf) 252{ 253 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 254 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 255 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 256 RF_PhysDiskAddr_t *pda; 257 int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 258 RF_RowCol_t scol; 259 char *srcbuf, *destbuf; 260 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 261 RF_Etimer_t timer; 262 263 RF_ETIMER_START(timer); 264 for (i = 0; i < node->numParams - 2; i += 2) { 265 RF_ASSERT(node->params[i + 1].p != ebuf); 266 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 267 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 268 scol = rf_EUCol(layoutPtr, pda->raidAddress); 269 srcbuf = (char *) node->params[i + 1].p; 270 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 271 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 272 } 273 274 RF_ETIMER_STOP(timer); 275 RF_ETIMER_EVAL(timer); 276 tracerec->q_us += RF_ETIMER_VAL_US(timer); 277} 278 279 280/************************************************************************************** 281 * This function is used in case where one data disk failed and both redundant disks 282 * alive. It is used in the EO_100_CreateWriteDAG. 
Note: if there is another disk 283 * failed in the stripe but not accessed at this time, then we should, instead, use 284 * the rf_EOWriteDoubleRecoveryFunc(). 285 **************************************************************************************/ 286int 287rf_Degraded_100_EOFunc(RF_DagNode_t *node) 288{ 289 rf_DegrESubroutine(node, node->results[1]); 290 rf_RecoveryXorFunc(node); /* does the wakeup here! */ 291#if 1 292 return (0); /* XXX this was missing... SHould these be 293 * void functions??? GO */ 294#endif 295} 296/************************************************************************************** 297 * This function is to encode one sector in one of the data disks to the E disk. 298 * However, in evenodd this function can also be used as decoding function to recover 299 * data from dead disk in the case of parity failure and a single data failure. 300 **************************************************************************************/ 301void 302rf_e_EncOneSect( 303 RF_RowCol_t srcLogicCol, 304 char *srcSecbuf, 305 RF_RowCol_t destLogicCol, 306 char *destSecbuf, 307 int bytesPerSector) 308{ 309 int S_index; /* index of the EU in the src col which need 310 * be Xored into all EUs in a dest sector */ 311 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1; 312 RF_RowCol_t j, indexInDest, /* row index of an encoding unit in 313 * the destination colume of encoding 314 * matrix */ 315 indexInSrc; /* row index of an encoding unit in the source 316 * colume used for recovery */ 317 int bytesPerEU = bytesPerSector / numRowInEncMatix; 318 319#if RF_EO_MATRIX_DIM > 17 320 int shortsPerEU = bytesPerEU / sizeof(short); 321 short *destShortBuf, *srcShortBuf1, *srcShortBuf2; 322 short temp1; 323#elif RF_EO_MATRIX_DIM == 17 324 int longsPerEU = bytesPerEU / sizeof(long); 325 long *destLongBuf, *srcLongBuf1, *srcLongBuf2; 326 long temp1; 327#endif 328 329#if RF_EO_MATRIX_DIM > 17 330 RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1); 331 RF_ASSERT(bytesPerEU % 
sizeof(short) == 0); 332#elif RF_EO_MATRIX_DIM == 17 333 RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4); 334 RF_ASSERT(bytesPerEU % sizeof(long) == 0); 335#endif 336 337 S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM); 338#if RF_EO_MATRIX_DIM > 17 339 srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU); 340#elif RF_EO_MATRIX_DIM == 17 341 srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU); 342#endif 343 344 for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) { 345 indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM); 346 347#if RF_EO_MATRIX_DIM > 17 348 destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU); 349 srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU); 350 for (j = 0; j < shortsPerEU; j++) { 351 temp1 = destShortBuf[j] ^ srcShortBuf1[j]; 352 /* note: S_index won't be at the end row for any src 353 * col! */ 354 if (indexInSrc != RF_EO_MATRIX_DIM - 1) 355 destShortBuf[j] = (srcShortBuf2[j]) ^ temp1; 356 /* if indexInSrc is at the end row, ie. 357 * RF_EO_MATRIX_DIM -1, then all elements are zero! 
*/ 358 else 359 destShortBuf[j] = temp1; 360 } 361 362#elif RF_EO_MATRIX_DIM == 17 363 destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU); 364 srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU); 365 for (j = 0; j < longsPerEU; j++) { 366 temp1 = destLongBuf[j] ^ srcLongBuf1[j]; 367 if (indexInSrc != RF_EO_MATRIX_DIM - 1) 368 destLongBuf[j] = (srcLongBuf2[j]) ^ temp1; 369 else 370 destLongBuf[j] = temp1; 371 } 372#endif 373 } 374} 375 376void 377rf_e_encToBuf( 378 RF_Raid_t * raidPtr, 379 RF_RowCol_t srcLogicCol, 380 char *srcbuf, 381 RF_RowCol_t destLogicCol, 382 char *destbuf, 383 int numSector) 384{ 385 int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 386 387 for (i = 0; i < numSector; i++) { 388 rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector); 389 srcbuf += bytesPerSector; 390 destbuf += bytesPerSector; 391 } 392} 393/************************************************************************************** 394 * when parity die and one data die, We use second redundant information, 'E', 395 * to recover the data in dead disk. 
This function is used in the recovery node of 396 * for EO_110_CreateReadDAG 397 **************************************************************************************/ 398int 399rf_RecoveryEFunc(RF_DagNode_t *node) 400{ 401 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 402 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 403 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 404 RF_RowCol_t scol, /* source logical column */ 405 fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress); /* logical column of 406 * failed SU */ 407 int i; 408 RF_PhysDiskAddr_t *pda; 409 int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 410 char *srcbuf, *destbuf; 411 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 412 RF_Etimer_t timer; 413 414 memset(node->results[0], 0, 415 rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); 416 if (node->dagHdr->status == rf_enable) { 417 RF_ETIMER_START(timer); 418 for (i = 0; i < node->numParams - 2; i += 2) 419 if (node->params[i + 1].p != node->results[0]) { 420 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 421 if (i == node->numParams - 4) 422 scol = RF_EO_MATRIX_DIM - 2; /* the colume of 423 * redundant E */ 424 else 425 scol = rf_EUCol(layoutPtr, pda->raidAddress); 426 srcbuf = (char *) node->params[i + 1].p; 427 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 428 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 429 rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector); 430 } 431 RF_ETIMER_STOP(timer); 432 RF_ETIMER_EVAL(timer); 433 tracerec->xor_us += RF_ETIMER_VAL_US(timer); 434 } 435 return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */ 436} 437/************************************************************************************** 438 * This function is used in the case where one data and the parity have filed. 
439 * (in EO_110_CreateWriteDAG ) 440 **************************************************************************************/ 441int 442rf_EO_DegradedWriteEFunc(RF_DagNode_t * node) 443{ 444 rf_DegrESubroutine(node, node->results[0]); 445 rf_GenericWakeupFunc(node, 0); 446#if 1 447 return (0); /* XXX Yet another one!! GO */ 448#endif 449} 450 451 452 453/************************************************************************************** 454 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES 455 **************************************************************************************/ 456 457void 458rf_doubleEOdecode( 459 RF_Raid_t * raidPtr, 460 char **rrdbuf, 461 char **dest, 462 RF_RowCol_t * fcol, 463 char *pbuf, 464 char *ebuf) 465{ 466 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 467 int i, j, k, f1, f2, row; 468 int rrdrow, erow, count = 0; 469 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 470 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1; 471#if 0 472 int pcol = (RF_EO_MATRIX_DIM) - 1; 473#endif 474 int ecol = (RF_EO_MATRIX_DIM) - 2; 475 int bytesPerEU = bytesPerSector / numRowInEncMatix; 476 int numDataCol = layoutPtr->numDataCol; 477#if RF_EO_MATRIX_DIM > 17 478 int shortsPerEU = bytesPerEU / sizeof(short); 479 short *rrdbuf_current, *pbuf_current, *ebuf_current; 480 short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; 481 short *temp; 482 short *P; 483 484 RF_ASSERT(bytesPerEU % sizeof(short) == 0); 485#elif RF_EO_MATRIX_DIM == 17 486 int longsPerEU = bytesPerEU / sizeof(long); 487 long *rrdbuf_current, *pbuf_current, *ebuf_current; 488 long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; 489 long *temp; 490 long *P; 491 492 RF_ASSERT(bytesPerEU % sizeof(long) == 0); 493#endif 494 P = RF_Malloc(bytesPerEU); 495 temp = RF_Malloc(bytesPerEU); 496 RF_ASSERT(*((long *) dest[0]) == 0); 497 RF_ASSERT(*((long *) dest[1]) == 0); 498 RF_ASSERT(*P == 0); 499 /* calculate 
the 'P' parameter, which, not parity, is the Xor of all 500 * elements in the last two column, ie. 'E' and 'parity' colume, see 501 * the Ref. paper by Blaum, et al 1993 */ 502 for (i = 0; i < numRowInEncMatix; i++) 503 for (k = 0; k < longsPerEU; k++) { 504#if RF_EO_MATRIX_DIM > 17 505 ebuf_current = ((short *) ebuf) + i * shortsPerEU + k; 506 pbuf_current = ((short *) pbuf) + i * shortsPerEU + k; 507#elif RF_EO_MATRIX_DIM == 17 508 ebuf_current = ((long *) ebuf) + i * longsPerEU + k; 509 pbuf_current = ((long *) pbuf) + i * longsPerEU + k; 510#endif 511 P[k] ^= *ebuf_current; 512 P[k] ^= *pbuf_current; 513 } 514 RF_ASSERT(fcol[0] != fcol[1]); 515 if (fcol[0] < fcol[1]) { 516#if RF_EO_MATRIX_DIM > 17 517 dest_smaller = (short *) (dest[0]); 518 dest_larger = (short *) (dest[1]); 519#elif RF_EO_MATRIX_DIM == 17 520 dest_smaller = (long *) (dest[0]); 521 dest_larger = (long *) (dest[1]); 522#endif 523 f1 = fcol[0]; 524 f2 = fcol[1]; 525 } else { 526#if RF_EO_MATRIX_DIM > 17 527 dest_smaller = (short *) (dest[1]); 528 dest_larger = (short *) (dest[0]); 529#elif RF_EO_MATRIX_DIM == 17 530 dest_smaller = (long *) (dest[1]); 531 dest_larger = (long *) (dest[0]); 532#endif 533 f1 = fcol[1]; 534 f2 = fcol[0]; 535 } 536 row = (RF_EO_MATRIX_DIM) - 1; 537 while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) { 538#if RF_EO_MATRIX_DIM > 17 539 dest_larger_current = dest_larger + row * shortsPerEU; 540 dest_smaller_current = dest_smaller + row * shortsPerEU; 541#elif RF_EO_MATRIX_DIM == 17 542 dest_larger_current = dest_larger + row * longsPerEU; 543 dest_smaller_current = dest_smaller + row * longsPerEU; 544#endif 545 /** Do the diagonal recovery. Initially, temp[k] = (failed 1), 546 which is the failed data in the colume which has smaller col index. 
**/ 547 /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */ 548 for (j = 0; j < numDataCol; j++) { 549 if (j == f1 || j == f2) 550 continue; 551 rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM); 552 if (rrdrow != (RF_EO_MATRIX_DIM) - 1) { 553#if RF_EO_MATRIX_DIM > 17 554 rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU; 555 for (k = 0; k < shortsPerEU; k++) 556 temp[k] ^= *(rrdbuf_current + k); 557#elif RF_EO_MATRIX_DIM == 17 558 rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU; 559 for (k = 0; k < longsPerEU; k++) 560 temp[k] ^= *(rrdbuf_current + k); 561#endif 562 } 563 } 564 /* step 2: ^E(erow,m-2), If erow is at the buttom row, don't 565 * Xor into it E(erow,m-2) = (principle diagonal) ^ (failed 566 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal 567 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle 568 * diagonal) ^ (failed 2) */ 569 570 erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM)); 571 if (erow != (RF_EO_MATRIX_DIM) - 1) { 572#if RF_EO_MATRIX_DIM > 17 573 ebuf_current = (short *) ebuf + shortsPerEU * erow; 574 for (k = 0; k < shortsPerEU; k++) 575 temp[k] ^= *(ebuf_current + k); 576#elif RF_EO_MATRIX_DIM == 17 577 ebuf_current = (long *) ebuf + longsPerEU * erow; 578 for (k = 0; k < longsPerEU; k++) 579 temp[k] ^= *(ebuf_current + k); 580#endif 581 } 582 /* step 3: ^P to obtain the failed data (failed 2). 
P can be 583 * proved to be actually (principle diagonal) After this 584 * step, temp[k] = (failed 2), the failed data to be recovered */ 585#if RF_EO_MATRIX_DIM > 17 586 for (k = 0; k < shortsPerEU; k++) 587 temp[k] ^= P[k]; 588 /* Put the data to the destination buffer */ 589 for (k = 0; k < shortsPerEU; k++) 590 dest_larger_current[k] = temp[k]; 591#elif RF_EO_MATRIX_DIM == 17 592 for (k = 0; k < longsPerEU; k++) 593 temp[k] ^= P[k]; 594 /* Put the data to the destination buffer */ 595 for (k = 0; k < longsPerEU; k++) 596 dest_larger_current[k] = temp[k]; 597#endif 598 599 /** THE FOLLOWING DO THE HORIZONTAL XOR **/ 600 /* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data 601 * columes */ 602 for (j = 0; j < numDataCol; j++) { 603 if (j == f1 || j == f2) 604 continue; 605#if RF_EO_MATRIX_DIM > 17 606 rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU; 607 for (k = 0; k < shortsPerEU; k++) 608 temp[k] ^= *(rrdbuf_current + k); 609#elif RF_EO_MATRIX_DIM == 17 610 rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU; 611 for (k = 0; k < longsPerEU; k++) 612 temp[k] ^= *(rrdbuf_current + k); 613#endif 614 } 615 /* step 2: ^A(row,m-1) */ 616 /* step 3: Put the data to the destination buffer */ 617#if RF_EO_MATRIX_DIM > 17 618 pbuf_current = (short *) pbuf + shortsPerEU * row; 619 for (k = 0; k < shortsPerEU; k++) 620 temp[k] ^= *(pbuf_current + k); 621 for (k = 0; k < shortsPerEU; k++) 622 dest_smaller_current[k] = temp[k]; 623#elif RF_EO_MATRIX_DIM == 17 624 pbuf_current = (long *) pbuf + longsPerEU * row; 625 for (k = 0; k < longsPerEU; k++) 626 temp[k] ^= *(pbuf_current + k); 627 for (k = 0; k < longsPerEU; k++) 628 dest_smaller_current[k] = temp[k]; 629#endif 630 count++; 631 } 632 /* Check if all Encoding Unit in the data buffer have been decoded, 633 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number, 634 * this algorithm will covered all buffer */ 635 RF_ASSERT(count == numRowInEncMatix); 636 RF_Free((char *) P, bytesPerEU); 
637 RF_Free((char *) temp, bytesPerEU); 638} 639 640 641/*************************************************************************************** 642* This function is called by double degragded read 643* EO_200_CreateReadDAG 644* 645***************************************************************************************/ 646int 647rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node) 648{ 649 int ndataParam = 0; 650 int np = node->numParams; 651 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; 652 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; 653 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 654 int i, prm, sector, nresults = node->numResults; 655 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 656 unsigned sosAddr; 657 int mallc_one = 0, mallc_two = 0; /* flags to indicate if 658 * memory is allocated */ 659 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 660 RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1, 661 npda; 662 RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol; 663 char **buf, *ebuf, *pbuf, *dest[2]; 664 long *suoff = NULL, *suend = NULL, *prmToCol = NULL, 665 psuoff = 0, esuoff = 0; 666 RF_SectorNum_t startSector, endSector; 667 RF_Etimer_t timer; 668 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 669 670 RF_ETIMER_START(timer); 671 672 /* Find out the number of parameters which are pdas for data 673 * information */ 674 for (i = 0; i <= np; i++) 675 if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) { 676 ndataParam = i; 677 break; 678 } 679 buf = RF_Malloc(numDataCol * sizeof(*buf)); 680 if (ndataParam != 0) { 681 suoff = RF_Malloc(ndataParam * sizeof(*suoff)); 682 suend = RF_Malloc(ndataParam * sizeof(*suend)); 683 prmToCol = RF_Malloc(ndataParam * sizeof(*prmToCol)); 684 } 685 if (asmap->failedPDAs[1] && 686 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { 687 
RF_ASSERT(0); /* currently, no support for this situation */ 688 ppda = node->params[np - 6].p; 689 ppda2 = node->params[np - 5].p; 690 RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY); 691 epda = node->params[np - 4].p; 692 epda2 = node->params[np - 3].p; 693 RF_ASSERT(epda2->type == RF_PDA_TYPE_Q); 694 } else { 695 ppda = node->params[np - 4].p; 696 epda = node->params[np - 3].p; 697 psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector); 698 esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector); 699 RF_ASSERT(psuoff == esuoff); 700 } 701 /* 702 the followings have three goals: 703 1. determine the startSector to begin decoding and endSector to end decoding. 704 2. determine the colume numbers of the two failed disks. 705 3. determine the offset and end offset of the access within each failed stripe unit. 706 */ 707 if (nresults == 1) { 708 /* find the startSector to begin decoding */ 709 pda = node->results[0]; 710 memset(pda->bufPtr, 0, bytesPerSector * pda->numSector); 711 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector); 712 fsuend[0] = fsuoff[0] + pda->numSector; 713 fsuoff[1] = 0; 714 fsuend[1] = 0; 715 startSector = fsuoff[0]; 716 endSector = fsuend[0]; 717 718 /* find out the column of failed disk being accessed */ 719 fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress); 720 721 /* find out the other failed colume not accessed */ 722 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 723 for (i = 0; i < numDataCol; i++) { 724 npda.raidAddress = sosAddr + (i * secPerSU); 725 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0); 726 /* skip over dead disks */ 727 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status)) 728 if (i != fcol[0]) 729 break; 730 } 731 RF_ASSERT(i < numDataCol); 732 fcol[1] = i; 733 } else { 734 RF_ASSERT(nresults == 2); 735 pda0 = node->results[0]; 736 memset(pda0->bufPtr, 0, bytesPerSector * pda0->numSector); 737 pda1 = node->results[1]; 738 
memset(pda1->bufPtr, 0, bytesPerSector * pda1->numSector); 739 /* determine the failed colume numbers of the two failed 740 * disks. */ 741 fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress); 742 fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress); 743 /* determine the offset and end offset of the access within 744 * each failed stripe unit. */ 745 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector); 746 fsuend[0] = fsuoff[0] + pda0->numSector; 747 fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector); 748 fsuend[1] = fsuoff[1] + pda1->numSector; 749 /* determine the startSector to begin decoding */ 750 startSector = RF_MIN(pda0->startSector, pda1->startSector); 751 /* determine the endSector to end decoding */ 752 endSector = RF_MAX(fsuend[0], fsuend[1]); 753 } 754 /* 755 assign the beginning sector and the end sector for each parameter 756 find out the corresponding colume # for each parameter 757 */ 758 for (prm = 0; prm < ndataParam; prm++) { 759 pda = node->params[prm].p; 760 suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector); 761 suend[prm] = suoff[prm] + pda->numSector; 762 prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress); 763 } 764 /* 'sector' is the sector for the current decoding algorithm. For each 765 * sector in the failed SU, find out the corresponding parameters that 766 * cover the current sector and that are needed for decoding of this 767 * sector in failed SU. 2. Find out if sector is in the shadow of any 768 * accessed failed SU. If not, malloc a temporary space of a sector in 769 * size. 
*/ 770 for (sector = startSector; sector < endSector; sector++) { 771 if (nresults == 2) 772 if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1])) 773 continue; 774 for (prm = 0; prm < ndataParam; prm++) 775 if (suoff[prm] <= sector && sector < suend[prm]) 776 buf[(prmToCol[prm])] = (char *)((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr + 777 rf_RaidAddressToByte(raidPtr, sector - suoff[prm]); 778 /* find out if sector is in the shadow of any accessed failed 779 * SU. If yes, assign dest[0], dest[1] to point at suitable 780 * position of the buffer corresponding to failed SUs. if no, 781 * malloc a temporary space of a sector in size for 782 * destination of decoding. */ 783 RF_ASSERT(nresults == 1 || nresults == 2); 784 if (nresults == 1) { 785 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]); 786 /* Always malloc temp buffer to dest[1] */ 787 dest[1] = RF_Malloc(bytesPerSector); 788 mallc_two = 1; 789 } else { 790 if (fsuoff[0] <= sector && sector < fsuend[0]) 791 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]); 792 else { 793 dest[0] = RF_Malloc(bytesPerSector); 794 mallc_one = 1; 795 } 796 if (fsuoff[1] <= sector && sector < fsuend[1]) 797 dest[1] = (char *)((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]); 798 else { 799 dest[1] = RF_Malloc(bytesPerSector); 800 mallc_two = 1; 801 } 802 RF_ASSERT(mallc_one == 0 || mallc_two == 0); 803 } 804 pbuf = (char *)ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff); 805 ebuf = (char *)epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff); 806 /* 807 * After finish finding all needed sectors, call doubleEOdecode function for decoding 808 * one sector to destination. 
809 */ 810 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf); 811 /* free all allocated memory, and mark flag to indicate no 812 * memory is being allocated */ 813 if (mallc_one == 1) 814 RF_Free(dest[0], bytesPerSector); 815 if (mallc_two == 1) 816 RF_Free(dest[1], bytesPerSector); 817 mallc_one = mallc_two = 0; 818 } 819 RF_Free(buf, numDataCol * sizeof(char *)); 820 if (ndataParam != 0) { 821 RF_Free(suoff, ndataParam * sizeof(long)); 822 RF_Free(suend, ndataParam * sizeof(long)); 823 RF_Free(prmToCol, ndataParam * sizeof(long)); 824 } 825 RF_ETIMER_STOP(timer); 826 RF_ETIMER_EVAL(timer); 827 if (tracerec) { 828 tracerec->q_us += RF_ETIMER_VAL_US(timer); 829 } 830 rf_GenericWakeupFunc(node, 0); 831#if 1 832 return (0); /* XXX is this even close!!?!?!!? GO */ 833#endif 834} 835 836 837/* currently, only access of one of the two failed SU is allowed in this function. 838 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into 839 * many accesses of single stripe unit. 840 */ 841 842int 843rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node) 844{ 845 int np = node->numParams; 846 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; 847 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; 848 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 849 RF_SectorNum_t sector; 850 RF_RowCol_t col, scol; 851 int prm, i, j; 852 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 853 unsigned sosAddr; 854 unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 855 RF_int64 numbytes; 856 RF_SectorNum_t startSector, endSector; 857 RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda; 858 RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol; 859 char **buf; /* buf[0], buf[1], buf[2], ...etc. 
point to 860 * buffer storing data read from col0, col1, 861 * col2 */ 862 char *ebuf, *pbuf, *dest[2], *olddata[2]; 863 RF_Etimer_t timer; 864 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 865 866 RF_ASSERT(asmap->numDataFailed == 1); /* currently only support this 867 * case, the other failed SU 868 * is not being accessed */ 869 RF_ETIMER_START(timer); 870 buf = RF_Malloc(numDataCol * sizeof(*buf)); 871 872 ppda = node->results[0];/* Instead of being buffers, node->results[0] 873 * and [1] are Ppda and Epda */ 874 epda = node->results[1]; 875 fpda = asmap->failedPDAs[0]; 876 877 /* First, recovery the failed old SU using EvenOdd double decoding */ 878 /* determine the startSector and endSector for decoding */ 879 startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector); 880 endSector = startSector + fpda->numSector; 881 /* Assign buf[col] pointers to point to each non-failed colume and 882 * initialize the pbuf and ebuf to point at the beginning of each 883 * source buffers and destination buffers */ 884 for (prm = 0; prm < numDataCol - 2; prm++) { 885 pda = (RF_PhysDiskAddr_t *) node->params[prm].p; 886 col = rf_EUCol(layoutPtr, pda->raidAddress); 887 buf[col] = pda->bufPtr; 888 } 889 /* pbuf and ebuf: they will change values as double recovery decoding 890 * goes on */ 891 pbuf = ppda->bufPtr; 892 ebuf = epda->bufPtr; 893 /* find out the logical colume numbers in the encoding matrix of the 894 * two failed columes */ 895 fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress); 896 897 /* find out the other failed colume not accessed this time */ 898 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 899 for (i = 0; i < numDataCol; i++) { 900 npda.raidAddress = sosAddr + (i * secPerSU); 901 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0); 902 /* skip over dead disks */ 903 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status)) 904 if (i != fcol[0]) 905 break; 906 } 907 RF_ASSERT(i 
< numDataCol); 908 fcol[1] = i; 909 /* assign temporary space to put recovered failed SU */ 910 numbytes = fpda->numSector * bytesPerSector; 911 olddata[0] = RF_Malloc(numbytes); 912 olddata[1] = RF_Malloc(numbytes); 913 dest[0] = olddata[0]; 914 dest[1] = olddata[1]; 915 /* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j] 916 * have already pointed at the beginning of each source buffers and 917 * destination buffers */ 918 for (sector = startSector, i = 0; sector < endSector; sector++, i++) { 919 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf); 920 for (j = 0; j < numDataCol; j++) 921 if ((j != fcol[0]) && (j != fcol[1])) 922 buf[j] += bytesPerSector; 923 dest[0] += bytesPerSector; 924 dest[1] += bytesPerSector; 925 ebuf += bytesPerSector; 926 pbuf += bytesPerSector; 927 } 928 /* after recovery, the buffer pointed by olddata[0] is the old failed 929 * data. With new writing data and this old data, use small write to 930 * calculate the new redundant informations */ 931 /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of 932 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol 933 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[ 934 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol 935 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of 936 * wudNodes; For current implementation, we assume the simplest case: 937 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1 938 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new 939 * data to be writen to the failed disk. We first bxor the new data 940 * into the old recovered data, then do the same things as small 941 * write. 
*/ 942 943 rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes); 944 /* do new 'E' calculation */ 945 /* find out the corresponding colume in encoding matrix for write 946 * colume to be encoded into redundant disk 'E' */ 947 scol = rf_EUCol(layoutPtr, fpda->raidAddress); 948 /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest 949 * buffer pointer */ 950 rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector); 951 952 /* do new 'P' calculation */ 953 rf_bxor(olddata[0], ppda->bufPtr, numbytes); 954 /* Free the allocated buffer */ 955 RF_Free(olddata[0], numbytes); 956 RF_Free(olddata[1], numbytes); 957 RF_Free(buf, numDataCol * sizeof(char *)); 958 959 RF_ETIMER_STOP(timer); 960 RF_ETIMER_EVAL(timer); 961 if (tracerec) { 962 tracerec->q_us += RF_ETIMER_VAL_US(timer); 963 } 964 rf_GenericWakeupFunc(node, 0); 965 return (0); 966} 967#endif /* RF_INCLUDE_EVENODD > 0 */ 968