1/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.25 2022/02/16 22:00:56 andvar Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: ChangMing Wu 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/* 30 * Code for RAID-EVENODD architecture. 
31 */ 32 33#include <sys/cdefs.h> 34__KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.25 2022/02/16 22:00:56 andvar Exp $"); 35 36#include "rf_archs.h" 37 38#ifdef _KERNEL_OPT 39#include "opt_raid_diagnostic.h" 40#endif 41 42#if RF_INCLUDE_EVENODD > 0 43 44#include <dev/raidframe/raidframevar.h> 45 46#include "rf_raid.h" 47#include "rf_dag.h" 48#include "rf_dagffrd.h" 49#include "rf_dagffwr.h" 50#include "rf_dagdegrd.h" 51#include "rf_dagdegwr.h" 52#include "rf_dagutils.h" 53#include "rf_dagfuncs.h" 54#include "rf_etimer.h" 55#include "rf_general.h" 56#include "rf_parityscan.h" 57#include "rf_evenodd.h" 58#include "rf_evenodd_dagfuncs.h" 59 60/* These redundant functions are for small write */ 61RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"}; 62RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"}; 63/* These redundant functions are for degraded read */ 64RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"}; 65RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"}; 66/********************************************************************************************** 67 * the following encoding node functions is used in EO_000_CreateLargeWriteDAG 68 **********************************************************************************************/ 69void 70rf_RegularPEFunc(RF_DagNode_t *node) 71{ 72 rf_RegularESubroutine(node, node->results[1]); 73 rf_RegularXorFunc(node);/* does the wakeup here! */ 74} 75 76 77/************************************************************************************************ 78 * For EO_001_CreateSmallWriteDAG, there are (i)RegularONEFunc() and (ii)SimpleONEFunc() to 79 * be used. The previous case is when write access at least sectors of full stripe unit. 
*  The latter function is used when the write accesses two stripe units but with total sectors
 *  less than the sectors per SU. In this case, the accessed areas of parity and 'E' appear as
 *  disconnected areas in their stripe units, and the parity write and the 'E' write are each
 *  divided into two distinct writes (four in total). The simple old-new write and the regular
 *  old-new write happen as in RAID-5.
 ************************************************************************************************/

/* Algorithm:
     1. Store the difference of old data and new data in the Rod buffer.
     2. Then encode this buffer into the buffer which already has the old 'E' information
        inside it; the result can be shown to be the new 'E' information.
     3. Xor the Wnd buffer into the difference buffer to recover the original old data.
   Here we have another alternative: allocate a temporary buffer for storing the difference of
   old data and new data, then encode the temp buf into the old 'E' buf to form the new 'E'; but
   this approach takes the same time as the previous one, and needs more memory.
94*/ 95void 96rf_RegularONEFunc(RF_DagNode_t *node) 97{ 98 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 99 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 100 int EpdaIndex = (node->numParams - 1) / 2 - 1; /* the parameter of node 101 * where you can find 102 * e-pda */ 103 int i, k; 104 int suoffset, length; 105 RF_RowCol_t scol; 106 char *srcbuf, *destbuf; 107 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 108 RF_Etimer_t timer; 109 RF_PhysDiskAddr_t *pda; 110#ifdef RAID_DIAGNOSTIC 111 RF_PhysDiskAddr_t *EPDA = 112 (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p; 113 int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector); 114 115 RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q); 116 RF_ASSERT(ESUOffset == 0); 117#endif /* RAID_DIAGNOSTIC */ 118 119 RF_ETIMER_START(timer); 120 121 /* Xor the Wnd buffer into Rod buffer, the difference of old data and 122 * new data is stored in Rod buffer */ 123 for (k = 0; k < EpdaIndex; k += 2) { 124 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector); 125 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length); 126 } 127 /* Start to encoding the buffer storing the difference of old data and 128 * new data into 'E' buffer */ 129 for (i = 0; i < EpdaIndex; i += 2) 130 if (node->params[i + 1].p != node->results[0]) { /* results[0] is buf ptr 131 * of E */ 132 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 133 srcbuf = (char *) node->params[i + 1].p; 134 scol = rf_EUCol(layoutPtr, pda->raidAddress); 135 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 136 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset); 137 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 138 } 139 /* Recover the original old data to be used by parity encoding 140 * function in XorNode */ 141 for (k = 0; k < EpdaIndex; k += 2) { 142 length = rf_RaidAddressToByte(raidPtr, 
((RF_PhysDiskAddr_t *) node->params[k].p)->numSector); 143 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length); 144 } 145 RF_ETIMER_STOP(timer); 146 RF_ETIMER_EVAL(timer); 147 tracerec->q_us += RF_ETIMER_VAL_US(timer); 148 rf_GenericWakeupFunc(node, 0); 149} 150 151void 152rf_SimpleONEFunc(RF_DagNode_t *node) 153{ 154 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 155 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 156 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 157 int retcode = 0; 158 char *srcbuf, *destbuf; 159 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 160 int length; 161 RF_RowCol_t scol; 162 RF_Etimer_t timer; 163 164 RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q); 165 if (node->dagHdr->status == rf_enable) { 166 RF_ETIMER_START(timer); 167 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector); /* this is a pda of 168 * writeDataNodes */ 169 /* bxor to buffer of readDataNodes */ 170 retcode = rf_bxor(node->params[5].p, node->params[1].p, length); 171 /* find out the corresponding column in encoding matrix for 172 * write column to be encoded into redundant disk 'E' */ 173 scol = rf_EUCol(layoutPtr, pda->raidAddress); 174 srcbuf = node->params[1].p; 175 destbuf = node->params[3].p; 176 /* Start encoding process */ 177 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 178 rf_bxor(node->params[5].p, node->params[1].p, length); 179 RF_ETIMER_STOP(timer); 180 RF_ETIMER_EVAL(timer); 181 tracerec->q_us += RF_ETIMER_VAL_US(timer); 182 183 } 184 rf_GenericWakeupFunc(node, retcode); /* call wake func 185 * explicitly since no 186 * I/O in this node */ 187} 188 189 190/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. 
large write ********/ 191void 192rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf) 193{ 194 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 195 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 196 RF_PhysDiskAddr_t *pda; 197 int i, suoffset; 198 RF_RowCol_t scol; 199 char *srcbuf, *destbuf; 200 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 201 RF_Etimer_t timer; 202 203 RF_ETIMER_START(timer); 204 for (i = 0; i < node->numParams - 2; i += 2) { 205 RF_ASSERT(node->params[i + 1].p != ebuf); 206 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 207 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 208 scol = rf_EUCol(layoutPtr, pda->raidAddress); 209 srcbuf = (char *) node->params[i + 1].p; 210 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset); 211 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 212 } 213 RF_ETIMER_STOP(timer); 214 RF_ETIMER_EVAL(timer); 215 tracerec->xor_us += RF_ETIMER_VAL_US(timer); 216} 217 218 219/******************************************************************************************* 220 * Used in EO_001_CreateLargeWriteDAG 221 ******************************************************************************************/ 222void 223rf_RegularEFunc(RF_DagNode_t *node) 224{ 225 rf_RegularESubroutine(node, node->results[0]); 226 rf_GenericWakeupFunc(node, 0); 227} 228/******************************************************************************************* 229 * This degraded function allow only two case: 230 * 1. when write access the full failed stripe unit, then the access can be more than 231 * one tripe units. 232 * 2. when write access only part of the failed SU, we assume accesses of more than 233 * one stripe unit is not allowed so that the write can be dealt with like a 234 * large write. 235 * The following function is based on these assumptions. 
So except in the second case, 236 * it looks the same as a large write encoding function. But this is not exactly the 237 * normal way for doing a degraded write, since raidframe have to break cases of access 238 * other than the above two into smaller accesses. We may have to change 239 * DegrESubroutine in the future. 240 *******************************************************************************************/ 241void 242rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf) 243{ 244 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 245 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 246 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 247 RF_PhysDiskAddr_t *pda; 248 int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 249 RF_RowCol_t scol; 250 char *srcbuf, *destbuf; 251 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 252 RF_Etimer_t timer; 253 254 RF_ETIMER_START(timer); 255 for (i = 0; i < node->numParams - 2; i += 2) { 256 RF_ASSERT(node->params[i + 1].p != ebuf); 257 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 258 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 259 scol = rf_EUCol(layoutPtr, pda->raidAddress); 260 srcbuf = (char *) node->params[i + 1].p; 261 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 262 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); 263 } 264 265 RF_ETIMER_STOP(timer); 266 RF_ETIMER_EVAL(timer); 267 tracerec->q_us += RF_ETIMER_VAL_US(timer); 268} 269 270 271/************************************************************************************** 272 * This function is used in case where one data disk failed and both redundant disks 273 * alive. It is used in the EO_100_CreateWriteDAG. 
Note: if there is another disk 274 * failed in the stripe but not accessed at this time, then we should, instead, use 275 * the rf_EOWriteDoubleRecoveryFunc(). 276 **************************************************************************************/ 277void 278rf_Degraded_100_EOFunc(RF_DagNode_t *node) 279{ 280 rf_DegrESubroutine(node, node->results[1]); 281 rf_RecoveryXorFunc(node); /* does the wakeup here! */ 282} 283/************************************************************************************** 284 * This function is to encode one sector in one of the data disks to the E disk. 285 * However, in evenodd this function can also be used as decoding function to recover 286 * data from dead disk in the case of parity failure and a single data failure. 287 **************************************************************************************/ 288void 289rf_e_EncOneSect( 290 RF_RowCol_t srcLogicCol, 291 char *srcSecbuf, 292 RF_RowCol_t destLogicCol, 293 char *destSecbuf, 294 int bytesPerSector) 295{ 296 int S_index; /* index of the EU in the src col which need 297 * be Xored into all EUs in a dest sector */ 298 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1; 299 RF_RowCol_t j, indexInDest, /* row index of an encoding unit in 300 * the destination column of encoding 301 * matrix */ 302 indexInSrc; /* row index of an encoding unit in the source 303 * column used for recovery */ 304 int bytesPerEU = bytesPerSector / numRowInEncMatix; 305 306#if RF_EO_MATRIX_DIM > 17 307 int shortsPerEU = bytesPerEU / sizeof(short); 308 short *destShortBuf, *srcShortBuf1, *srcShortBuf2; 309 short temp1; 310#elif RF_EO_MATRIX_DIM == 17 311 int longsPerEU = bytesPerEU / sizeof(long); 312 long *destLongBuf, *srcLongBuf1, *srcLongBuf2; 313 long temp1; 314#endif 315 316#if RF_EO_MATRIX_DIM > 17 317 RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1); 318 RF_ASSERT(bytesPerEU % sizeof(short) == 0); 319#elif RF_EO_MATRIX_DIM == 17 320 RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4); 321 
RF_ASSERT(bytesPerEU % sizeof(long) == 0); 322#endif 323 324 S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM); 325#if RF_EO_MATRIX_DIM > 17 326 srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU); 327#elif RF_EO_MATRIX_DIM == 17 328 srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU); 329#endif 330 331 for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) { 332 indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM); 333 334#if RF_EO_MATRIX_DIM > 17 335 destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU); 336 srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU); 337 for (j = 0; j < shortsPerEU; j++) { 338 temp1 = destShortBuf[j] ^ srcShortBuf1[j]; 339 /* note: S_index won't be at the end row for any src 340 * col! */ 341 if (indexInSrc != RF_EO_MATRIX_DIM - 1) 342 destShortBuf[j] = (srcShortBuf2[j]) ^ temp1; 343 /* if indexInSrc is at the end row, ie. 344 * RF_EO_MATRIX_DIM -1, then all elements are zero! 
*/ 345 else 346 destShortBuf[j] = temp1; 347 } 348 349#elif RF_EO_MATRIX_DIM == 17 350 destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU); 351 srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU); 352 for (j = 0; j < longsPerEU; j++) { 353 temp1 = destLongBuf[j] ^ srcLongBuf1[j]; 354 if (indexInSrc != RF_EO_MATRIX_DIM - 1) 355 destLongBuf[j] = (srcLongBuf2[j]) ^ temp1; 356 else 357 destLongBuf[j] = temp1; 358 } 359#endif 360 } 361} 362 363void 364rf_e_encToBuf( 365 RF_Raid_t * raidPtr, 366 RF_RowCol_t srcLogicCol, 367 char *srcbuf, 368 RF_RowCol_t destLogicCol, 369 char *destbuf, 370 int numSector) 371{ 372 int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 373 374 for (i = 0; i < numSector; i++) { 375 rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector); 376 srcbuf += bytesPerSector; 377 destbuf += bytesPerSector; 378 } 379} 380/************************************************************************************** 381 * when parity die and one data die, We use second redundant information, 'E', 382 * to recover the data in dead disk. 
This function is used in the recovery node of 383 * for EO_110_CreateReadDAG 384 **************************************************************************************/ 385void 386rf_RecoveryEFunc(RF_DagNode_t *node) 387{ 388 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 389 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 390 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 391 RF_RowCol_t scol, /* source logical column */ 392 fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress); /* logical column of 393 * failed SU */ 394 int i; 395 RF_PhysDiskAddr_t *pda; 396 int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 397 char *srcbuf, *destbuf; 398 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 399 RF_Etimer_t timer; 400 401 memset(node->results[0], 0, 402 rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); 403 if (node->dagHdr->status == rf_enable) { 404 RF_ETIMER_START(timer); 405 for (i = 0; i < node->numParams - 2; i += 2) 406 if (node->params[i + 1].p != node->results[0]) { 407 pda = (RF_PhysDiskAddr_t *) node->params[i].p; 408 if (i == node->numParams - 4) 409 scol = RF_EO_MATRIX_DIM - 2; /* the column of 410 * redundant E */ 411 else 412 scol = rf_EUCol(layoutPtr, pda->raidAddress); 413 srcbuf = (char *) node->params[i + 1].p; 414 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 415 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 416 rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector); 417 } 418 RF_ETIMER_STOP(timer); 419 RF_ETIMER_EVAL(timer); 420 tracerec->xor_us += RF_ETIMER_VAL_US(timer); 421 } 422 rf_GenericWakeupFunc(node, 0); /* node execute successfully */ 423} 424/************************************************************************************** 425 * This function is used in the case where one data and the parity have filed. 
426 * (in EO_110_CreateWriteDAG ) 427 **************************************************************************************/ 428void 429rf_EO_DegradedWriteEFunc(RF_DagNode_t * node) 430{ 431 rf_DegrESubroutine(node, node->results[0]); 432 rf_GenericWakeupFunc(node, 0); 433} 434 435 436 437/************************************************************************************** 438 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES 439 **************************************************************************************/ 440 441void 442rf_doubleEOdecode( 443 RF_Raid_t * raidPtr, 444 char **rrdbuf, 445 char **dest, 446 RF_RowCol_t * fcol, 447 char *pbuf, 448 char *ebuf) 449{ 450 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 451 int i, j, k, f1, f2, row; 452 int rrdrow, erow, count = 0; 453 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 454 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1; 455#if 0 456 int pcol = (RF_EO_MATRIX_DIM) - 1; 457#endif 458 int ecol = (RF_EO_MATRIX_DIM) - 2; 459 int bytesPerEU = bytesPerSector / numRowInEncMatix; 460 int numDataCol = layoutPtr->numDataCol; 461#if RF_EO_MATRIX_DIM > 17 462 int shortsPerEU = bytesPerEU / sizeof(short); 463 short *rrdbuf_current, *pbuf_current, *ebuf_current; 464 short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; 465 short *temp; 466 short *P; 467 468 RF_ASSERT(bytesPerEU % sizeof(short) == 0); 469#elif RF_EO_MATRIX_DIM == 17 470 int longsPerEU = bytesPerEU / sizeof(long); 471 long *rrdbuf_current, *pbuf_current, *ebuf_current; 472 long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; 473 long *temp; 474 long *P; 475 476 RF_ASSERT(bytesPerEU % sizeof(long) == 0); 477#endif 478 P = RF_Malloc(bytesPerEU); 479 temp = RF_Malloc(bytesPerEU); 480 RF_ASSERT(*((long *) dest[0]) == 0); 481 RF_ASSERT(*((long *) dest[1]) == 0); 482 RF_ASSERT(*P == 0); 483 /* calculate the 'P' parameter, which, not parity, is the Xor of all 484 * 
elements in the last two columns, ie. 'E' and 'parity' columns, see 485 * the Ref. paper by Blaum, et al 1993 */ 486 for (i = 0; i < numRowInEncMatix; i++) 487 for (k = 0; k < longsPerEU; k++) { 488#if RF_EO_MATRIX_DIM > 17 489 ebuf_current = ((short *) ebuf) + i * shortsPerEU + k; 490 pbuf_current = ((short *) pbuf) + i * shortsPerEU + k; 491#elif RF_EO_MATRIX_DIM == 17 492 ebuf_current = ((long *) ebuf) + i * longsPerEU + k; 493 pbuf_current = ((long *) pbuf) + i * longsPerEU + k; 494#endif 495 P[k] ^= *ebuf_current; 496 P[k] ^= *pbuf_current; 497 } 498 RF_ASSERT(fcol[0] != fcol[1]); 499 if (fcol[0] < fcol[1]) { 500#if RF_EO_MATRIX_DIM > 17 501 dest_smaller = (short *) (dest[0]); 502 dest_larger = (short *) (dest[1]); 503#elif RF_EO_MATRIX_DIM == 17 504 dest_smaller = (long *) (dest[0]); 505 dest_larger = (long *) (dest[1]); 506#endif 507 f1 = fcol[0]; 508 f2 = fcol[1]; 509 } else { 510#if RF_EO_MATRIX_DIM > 17 511 dest_smaller = (short *) (dest[1]); 512 dest_larger = (short *) (dest[0]); 513#elif RF_EO_MATRIX_DIM == 17 514 dest_smaller = (long *) (dest[1]); 515 dest_larger = (long *) (dest[0]); 516#endif 517 f1 = fcol[1]; 518 f2 = fcol[0]; 519 } 520 row = (RF_EO_MATRIX_DIM) - 1; 521 while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) { 522#if RF_EO_MATRIX_DIM > 17 523 dest_larger_current = dest_larger + row * shortsPerEU; 524 dest_smaller_current = dest_smaller + row * shortsPerEU; 525#elif RF_EO_MATRIX_DIM == 17 526 dest_larger_current = dest_larger + row * longsPerEU; 527 dest_smaller_current = dest_smaller + row * longsPerEU; 528#endif 529 /** Do the diagonal recovery. Initially, temp[k] = (failed 1), 530 which is the failed data in the column which has smaller col index. 
**/ 531 /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */ 532 for (j = 0; j < numDataCol; j++) { 533 if (j == f1 || j == f2) 534 continue; 535 rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM); 536 if (rrdrow != (RF_EO_MATRIX_DIM) - 1) { 537#if RF_EO_MATRIX_DIM > 17 538 rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU; 539 for (k = 0; k < shortsPerEU; k++) 540 temp[k] ^= *(rrdbuf_current + k); 541#elif RF_EO_MATRIX_DIM == 17 542 rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU; 543 for (k = 0; k < longsPerEU; k++) 544 temp[k] ^= *(rrdbuf_current + k); 545#endif 546 } 547 } 548 /* step 2: ^E(erow,m-2), If erow is at the bottom row, don't 549 * Xor into it E(erow,m-2) = (principle diagonal) ^ (failed 550 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal 551 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle 552 * diagonal) ^ (failed 2) */ 553 554 erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM)); 555 if (erow != (RF_EO_MATRIX_DIM) - 1) { 556#if RF_EO_MATRIX_DIM > 17 557 ebuf_current = (short *) ebuf + shortsPerEU * erow; 558 for (k = 0; k < shortsPerEU; k++) 559 temp[k] ^= *(ebuf_current + k); 560#elif RF_EO_MATRIX_DIM == 17 561 ebuf_current = (long *) ebuf + longsPerEU * erow; 562 for (k = 0; k < longsPerEU; k++) 563 temp[k] ^= *(ebuf_current + k); 564#endif 565 } 566 /* step 3: ^P to obtain the failed data (failed 2). 
P can be 567 * proved to be actually (principle diagonal) After this 568 * step, temp[k] = (failed 2), the failed data to be recovered */ 569#if RF_EO_MATRIX_DIM > 17 570 for (k = 0; k < shortsPerEU; k++) 571 temp[k] ^= P[k]; 572 /* Put the data to the destination buffer */ 573 for (k = 0; k < shortsPerEU; k++) 574 dest_larger_current[k] = temp[k]; 575#elif RF_EO_MATRIX_DIM == 17 576 for (k = 0; k < longsPerEU; k++) 577 temp[k] ^= P[k]; 578 /* Put the data to the destination buffer */ 579 for (k = 0; k < longsPerEU; k++) 580 dest_larger_current[k] = temp[k]; 581#endif 582 583 /** THE FOLLOWING DO THE HORIZONTAL XOR **/ 584 /* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data 585 * columns */ 586 for (j = 0; j < numDataCol; j++) { 587 if (j == f1 || j == f2) 588 continue; 589#if RF_EO_MATRIX_DIM > 17 590 rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU; 591 for (k = 0; k < shortsPerEU; k++) 592 temp[k] ^= *(rrdbuf_current + k); 593#elif RF_EO_MATRIX_DIM == 17 594 rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU; 595 for (k = 0; k < longsPerEU; k++) 596 temp[k] ^= *(rrdbuf_current + k); 597#endif 598 } 599 /* step 2: ^A(row,m-1) */ 600 /* step 3: Put the data to the destination buffer */ 601#if RF_EO_MATRIX_DIM > 17 602 pbuf_current = (short *) pbuf + shortsPerEU * row; 603 for (k = 0; k < shortsPerEU; k++) 604 temp[k] ^= *(pbuf_current + k); 605 for (k = 0; k < shortsPerEU; k++) 606 dest_smaller_current[k] = temp[k]; 607#elif RF_EO_MATRIX_DIM == 17 608 pbuf_current = (long *) pbuf + longsPerEU * row; 609 for (k = 0; k < longsPerEU; k++) 610 temp[k] ^= *(pbuf_current + k); 611 for (k = 0; k < longsPerEU; k++) 612 dest_smaller_current[k] = temp[k]; 613#endif 614 count++; 615 } 616 /* Check if all Encoding Unit in the data buffer have been decoded, 617 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number, 618 * this algorithm will covered all buffer */ 619 RF_ASSERT(count == numRowInEncMatix); 620 RF_Free((char *) P, bytesPerEU); 
621 RF_Free((char *) temp, bytesPerEU); 622} 623 624 625/*************************************************************************************** 626* This function is called by double degraded read 627* EO_200_CreateReadDAG 628* 629***************************************************************************************/ 630void 631rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node) 632{ 633 int ndataParam = 0; 634 int np = node->numParams; 635 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; 636 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; 637 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 638 int i, prm, sector, nresults = node->numResults; 639 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 640 unsigned sosAddr; 641 int mallc_one = 0, mallc_two = 0; /* flags to indicate if 642 * memory is allocated */ 643 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 644 RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1, 645 npda; 646 RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol; 647 char **buf, *ebuf, *pbuf, *dest[2]; 648 long *suoff = NULL, *suend = NULL, *prmToCol = NULL, 649 psuoff = 0, esuoff = 0; 650 RF_SectorNum_t startSector, endSector; 651 RF_Etimer_t timer; 652 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 653 654 RF_ETIMER_START(timer); 655 656 /* Find out the number of parameters which are pdas for data 657 * information */ 658 for (i = 0; i <= np; i++) 659 if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) { 660 ndataParam = i; 661 break; 662 } 663 buf = RF_Malloc(numDataCol * sizeof(*buf)); 664 if (ndataParam != 0) { 665 suoff = RF_Malloc(ndataParam * sizeof(*suoff)); 666 suend = RF_Malloc(ndataParam * sizeof(*suend)); 667 prmToCol = RF_Malloc(ndataParam * sizeof(*prmToCol)); 668 } 669 if (asmap->failedPDAs[1] && 670 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { 671 
RF_ASSERT(0); /* currently, no support for this situation */ 672 ppda = node->params[np - 6].p; 673 ppda2 = node->params[np - 5].p; 674 RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY); 675 epda = node->params[np - 4].p; 676 epda2 = node->params[np - 3].p; 677 RF_ASSERT(epda2->type == RF_PDA_TYPE_Q); 678 } else { 679 ppda = node->params[np - 4].p; 680 epda = node->params[np - 3].p; 681 psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector); 682 esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector); 683 RF_ASSERT(psuoff == esuoff); 684 } 685 /* 686 the followings have three goals: 687 1. determine the startSector to begin decoding and endSector to end decoding. 688 2. determine the column numbers of the two failed disks. 689 3. determine the offset and end offset of the access within each failed stripe unit. 690 */ 691 if (nresults == 1) { 692 /* find the startSector to begin decoding */ 693 pda = node->results[0]; 694 memset(pda->bufPtr, 0, bytesPerSector * pda->numSector); 695 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector); 696 fsuend[0] = fsuoff[0] + pda->numSector; 697 fsuoff[1] = 0; 698 fsuend[1] = 0; 699 startSector = fsuoff[0]; 700 endSector = fsuend[0]; 701 702 /* find out the column of failed disk being accessed */ 703 fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress); 704 705 /* find out the other failed column not accessed */ 706 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 707 for (i = 0; i < numDataCol; i++) { 708 npda.raidAddress = sosAddr + (i * secPerSU); 709 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0); 710 /* skip over dead disks */ 711 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status)) 712 if (i != fcol[0]) 713 break; 714 } 715 RF_ASSERT(i < numDataCol); 716 fcol[1] = i; 717 } else { 718 RF_ASSERT(nresults == 2); 719 pda0 = node->results[0]; 720 memset(pda0->bufPtr, 0, bytesPerSector * pda0->numSector); 721 pda1 = node->results[1]; 722 
memset(pda1->bufPtr, 0, bytesPerSector * pda1->numSector); 723 /* determine the failed column numbers of the two failed 724 * disks. */ 725 fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress); 726 fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress); 727 /* determine the offset and end offset of the access within 728 * each failed stripe unit. */ 729 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector); 730 fsuend[0] = fsuoff[0] + pda0->numSector; 731 fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector); 732 fsuend[1] = fsuoff[1] + pda1->numSector; 733 /* determine the startSector to begin decoding */ 734 startSector = RF_MIN(pda0->startSector, pda1->startSector); 735 /* determine the endSector to end decoding */ 736 endSector = RF_MAX(fsuend[0], fsuend[1]); 737 } 738 /* 739 assign the beginning sector and the end sector for each parameter 740 find out the corresponding column # for each parameter 741 */ 742 for (prm = 0; prm < ndataParam; prm++) { 743 pda = node->params[prm].p; 744 suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector); 745 suend[prm] = suoff[prm] + pda->numSector; 746 prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress); 747 } 748 /* 'sector' is the sector for the current decoding algorithm. For each 749 * sector in the failed SU, find out the corresponding parameters that 750 * cover the current sector and that are needed for decoding of this 751 * sector in failed SU. 2. Find out if sector is in the shadow of any 752 * accessed failed SU. If not, malloc a temporary space of a sector in 753 * size. 
*/ 754 for (sector = startSector; sector < endSector; sector++) { 755 if (nresults == 2) 756 if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1])) 757 continue; 758 for (prm = 0; prm < ndataParam; prm++) 759 if (suoff[prm] <= sector && sector < suend[prm]) 760 buf[(prmToCol[prm])] = (char *)((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr + 761 rf_RaidAddressToByte(raidPtr, sector - suoff[prm]); 762 /* find out if sector is in the shadow of any accessed failed 763 * SU. If yes, assign dest[0], dest[1] to point at suitable 764 * position of the buffer corresponding to failed SUs. if no, 765 * malloc a temporary space of a sector in size for 766 * destination of decoding. */ 767 RF_ASSERT(nresults == 1 || nresults == 2); 768 if (nresults == 1) { 769 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]); 770 /* Always malloc temp buffer to dest[1] */ 771 dest[1] = RF_Malloc(bytesPerSector); 772 mallc_two = 1; 773 } else { 774 if (fsuoff[0] <= sector && sector < fsuend[0]) 775 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]); 776 else { 777 dest[0] = RF_Malloc(bytesPerSector); 778 mallc_one = 1; 779 } 780 if (fsuoff[1] <= sector && sector < fsuend[1]) 781 dest[1] = (char *)((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]); 782 else { 783 dest[1] = RF_Malloc(bytesPerSector); 784 mallc_two = 1; 785 } 786 RF_ASSERT(mallc_one == 0 || mallc_two == 0); 787 } 788 pbuf = (char *)ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff); 789 ebuf = (char *)epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff); 790 /* 791 * After finish finding all needed sectors, call doubleEOdecode function for decoding 792 * one sector to destination. 
793 */ 794 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf); 795 /* free all allocated memory, and mark flag to indicate no 796 * memory is being allocated */ 797 if (mallc_one == 1) 798 RF_Free(dest[0], bytesPerSector); 799 if (mallc_two == 1) 800 RF_Free(dest[1], bytesPerSector); 801 mallc_one = mallc_two = 0; 802 } 803 RF_Free(buf, numDataCol * sizeof(char *)); 804 if (ndataParam != 0) { 805 RF_Free(suoff, ndataParam * sizeof(long)); 806 RF_Free(suend, ndataParam * sizeof(long)); 807 RF_Free(prmToCol, ndataParam * sizeof(long)); 808 } 809 RF_ETIMER_STOP(timer); 810 RF_ETIMER_EVAL(timer); 811 if (tracerec) { 812 tracerec->q_us += RF_ETIMER_VAL_US(timer); 813 } 814 rf_GenericWakeupFunc(node, 0); 815} 816 817 818/* currently, only access of one of the two failed SU is allowed in this function. 819 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into 820 * many accesses of single stripe unit. 821 */ 822 823void 824rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node) 825{ 826 int np = node->numParams; 827 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; 828 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; 829 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 830 RF_SectorNum_t sector; 831 RF_RowCol_t col, scol; 832 int prm, i, j; 833 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 834 unsigned sosAddr; 835 unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); 836 RF_int64 numbytes; 837 RF_SectorNum_t startSector, endSector; 838 RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda; 839 RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol; 840 char **buf; /* buf[0], buf[1], buf[2], ...etc. 
point to 841 * buffer storing data read from col0, col1, 842 * col2 */ 843 char *ebuf, *pbuf, *dest[2], *olddata[2]; 844 RF_Etimer_t timer; 845 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 846 847 RF_ASSERT(asmap->numDataFailed == 1); /* currently only support this 848 * case, the other failed SU 849 * is not being accessed */ 850 RF_ETIMER_START(timer); 851 buf = RF_Malloc(numDataCol * sizeof(*buf)); 852 853 ppda = node->results[0];/* Instead of being buffers, node->results[0] 854 * and [1] are Ppda and Epda */ 855 epda = node->results[1]; 856 fpda = asmap->failedPDAs[0]; 857 858 /* First, recovery the failed old SU using EvenOdd double decoding */ 859 /* determine the startSector and endSector for decoding */ 860 startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector); 861 endSector = startSector + fpda->numSector; 862 /* Assign buf[col] pointers to point to each non-failed column and 863 * initialize the pbuf and ebuf to point at the beginning of each 864 * source buffers and destination buffers */ 865 for (prm = 0; prm < numDataCol - 2; prm++) { 866 pda = (RF_PhysDiskAddr_t *) node->params[prm].p; 867 col = rf_EUCol(layoutPtr, pda->raidAddress); 868 buf[col] = pda->bufPtr; 869 } 870 /* pbuf and ebuf: they will change values as double recovery decoding 871 * goes on */ 872 pbuf = ppda->bufPtr; 873 ebuf = epda->bufPtr; 874 /* find out the logical column numbers in the encoding matrix of the 875 * two failed columns */ 876 fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress); 877 878 /* find out the other failed column not accessed this time */ 879 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 880 for (i = 0; i < numDataCol; i++) { 881 npda.raidAddress = sosAddr + (i * secPerSU); 882 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0); 883 /* skip over dead disks */ 884 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status)) 885 if (i != fcol[0]) 886 break; 887 } 888 RF_ASSERT(i 
< numDataCol); 889 fcol[1] = i; 890 /* assign temporary space to put recovered failed SU */ 891 numbytes = fpda->numSector * bytesPerSector; 892 olddata[0] = RF_Malloc(numbytes); 893 olddata[1] = RF_Malloc(numbytes); 894 dest[0] = olddata[0]; 895 dest[1] = olddata[1]; 896 /* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j] 897 * have already pointed at the beginning of each source buffers and 898 * destination buffers */ 899 for (sector = startSector, i = 0; sector < endSector; sector++, i++) { 900 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf); 901 for (j = 0; j < numDataCol; j++) 902 if ((j != fcol[0]) && (j != fcol[1])) 903 buf[j] += bytesPerSector; 904 dest[0] += bytesPerSector; 905 dest[1] += bytesPerSector; 906 ebuf += bytesPerSector; 907 pbuf += bytesPerSector; 908 } 909 /* after recovery, the buffer pointed by olddata[0] is the old failed 910 * data. With new writing data and this old data, use small write to 911 * calculate the new redundant information */ 912 /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of 913 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol 914 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[ 915 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol 916 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of 917 * wudNodes; For current implementation, we assume the simplest case: 918 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1 919 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new 920 * data to be written to the failed disk. We first bxor the new data 921 * into the old recovered data, then do the same things as small 922 * write. 
*/ 923 924 rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes); 925 /* do new 'E' calculation */ 926 /* find out the corresponding column in encoding matrix for write 927 * column to be encoded into redundant disk 'E' */ 928 scol = rf_EUCol(layoutPtr, fpda->raidAddress); 929 /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest 930 * buffer pointer */ 931 rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector); 932 933 /* do new 'P' calculation */ 934 rf_bxor(olddata[0], ppda->bufPtr, numbytes); 935 /* Free the allocated buffer */ 936 RF_Free(olddata[0], numbytes); 937 RF_Free(olddata[1], numbytes); 938 RF_Free(buf, numDataCol * sizeof(char *)); 939 940 RF_ETIMER_STOP(timer); 941 RF_ETIMER_EVAL(timer); 942 if (tracerec) { 943 tracerec->q_us += RF_ETIMER_VAL_US(timer); 944 } 945 rf_GenericWakeupFunc(node, 0); 946} 947#endif /* RF_INCLUDE_EVENODD > 0 */ 948