1/* $NetBSD: rf_pqdegdags.c,v 1.17 2023/10/15 18:15:20 oster Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Daniel Stodolsky 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/* 30 * rf_pqdegdags.c 31 * Degraded mode dags for double fault cases. 32*/ 33 34 35#include <sys/cdefs.h> 36__KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.17 2023/10/15 18:15:20 oster Exp $"); 37 38#include "rf_archs.h" 39 40#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) 41 42#include <dev/raidframe/raidframevar.h> 43 44#include "rf_raid.h" 45#include "rf_dag.h" 46#include "rf_dagdegrd.h" 47#include "rf_dagdegwr.h" 48#include "rf_dagfuncs.h" 49#include "rf_dagutils.h" 50#include "rf_etimer.h" 51#include "rf_acctrace.h" 52#include "rf_general.h" 53#include "rf_pqdegdags.h" 54#include "rf_pq.h" 55 56static void 57applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda, 58 RF_PhysDiskAddr_t * qpda, const struct buf *bp); 59 60/* 61 Two data drives have failed, and we are doing a read that covers one of them. 
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                      | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock (U), and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving user data _not_ being read by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data
 * X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer where the
 * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
 * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
 * zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what's needed and what's not.
100 ****************************************************************************************/ 101/* init a disk node with 2 successors and one predecessor */ 102#define INIT_DISK_NODE(node,name) \ 103rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \ 104(node)->succedents[0] = unblockNode; \ 105(node)->succedents[1] = recoveryNode; \ 106(node)->antecedents[0] = blockNode; \ 107(node)->antType[0] = rf_control 108 109#define DISK_NODE_PARAMS(_node_,_p_) \ 110 (_node_).params[0].p = _p_ ; \ 111 (_node_).params[1].p = (_p_)->bufPtr; \ 112 (_node_).params[2].v = parityStripeID; \ 113 (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru) 114 115#define DISK_NODE_PDA(node) ((node)->params[0].p) 116 117RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead) 118{ 119 rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList, 120 "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc); 121} 122 123static void 124applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, const struct buf *bp) 125{ 126 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 127 RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector); 128 RF_SectorCount_t s0len = ppda->numSector, len; 129 RF_SectorNum_t suoffset; 130 unsigned coeff; 131 char *pbuf = ppda->bufPtr; 132 char *qbuf = qpda->bufPtr; 133 char *buf; 134 int delta; 135 136 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 137 len = pda->numSector; 138 /* see if pda intersects a recovery pda */ 139 if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) { 140 buf = pda->bufPtr; 141 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress); 142 coeff = (coeff % raidPtr->Layout.numDataCol); 143 144 if (suoffset < s0off) { 145 delta = s0off - suoffset; 146 buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); 147 suoffset = s0off; 148 len -= delta; 149 } 150 if 
(suoffset > s0off) { 151 delta = suoffset - s0off; 152 pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); 153 qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); 154 } 155 if ((suoffset + len) > (s0len + s0off)) 156 len = s0len + s0off - suoffset; 157 158 /* src, dest, len */ 159 /* rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp); */ 160 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len)); 161 162 /* dest, src, len, coeff */ 163 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff); 164 } 165} 166/* 167 Recover data in the case of a double failure. There can be two 168 result buffers, one for each chunk of data trying to be recovered. 169 The params are pda's that have not been range restricted or otherwise 170 politely massaged - this should be done here. The last params are the 171 pdas of P and Q, followed by the raidPtr. The list can look like 172 173 pda, pda, ... , p pda, q pda, raidptr, asm 174 175 or 176 177 pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm 178 179 depending on whether two chunks of recovery data were required. 180 181 The second condition only arises if there are two failed buffers 182 whose lengths do not add up a stripe unit. 
183*/ 184 185 186void 187rf_PQDoubleRecoveryFunc(RF_DagNode_t *node) 188{ 189 int np = node->numParams; 190 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; 191 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; 192 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 193 int d, i; 194 unsigned coeff; 195 RF_RaidAddr_t sosAddr; /* , suoffset; */ 196 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 197 int two = 0; 198 RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda; 199 /* char *buf; */ 200 int numDataCol = layoutPtr->numDataCol; 201 RF_Etimer_t timer; 202 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 203 204 RF_ETIMER_START(timer); 205 206 if (asmap->failedPDAs[1] && 207 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { 208 RF_ASSERT(0); 209 ppda = node->params[np - 6].p; 210 /* ppda2 = node->params[np - 5].p; */ 211 qpda = node->params[np - 4].p; 212 /* qpda2 = node->params[np - 3].p; */ 213 d = (np - 6); 214 two = 1; 215 } else { 216 ppda = node->params[np - 4].p; 217 qpda = node->params[np - 3].p; 218 d = (np - 4); 219 } 220 221 for (i = 0; i < d; i++) { 222 pda = node->params[i].p; 223 /* buf = pda->bufPtr; */ 224 /* suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); */ 225 /* len = pda->numSector; */ 226 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); 227 /* compute the data unit offset within the column */ 228 coeff = (coeff % raidPtr->Layout.numDataCol); 229 /* see if pda intersects a recovery pda */ 230 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp); 231 if (two) 232 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp); 233 } 234 235 /* ok, we got the parity back to the point where we can recover. We 236 * now need to determine the coeff of the columns that need to be 237 * recovered. We can also only need to recover a single stripe unit. 
*/ 238 239 if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit 240 * to recover. */ 241 pda = asmap->failedPDAs[0]; 242 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 243 /* need to determine the column of the other failed disk */ 244 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); 245 /* compute the data unit offset within the column */ 246 coeff = (coeff % raidPtr->Layout.numDataCol); 247 for (i = 0; i < numDataCol; i++) { 248 npda.raidAddress = sosAddr + (i * secPerSU); 249 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0); 250 /* skip over dead disks */ 251 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status)) 252 if (i != coeff) 253 break; 254 } 255 RF_ASSERT(i < numDataCol); 256 RF_ASSERT(two == 0); 257 /* recover the data. Since we need only want to recover one 258 * column, we overwrite the parity with the other one. */ 259 if (coeff < i) /* recovering 'a' */ 260 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i); 261 else /* recovering 'b' */ 262 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff); 263 } else 264 RF_PANIC(); 265 266 RF_ETIMER_STOP(timer); 267 RF_ETIMER_EVAL(timer); 268 if (tracerec) 269 tracerec->q_us += RF_ETIMER_VAL_US(timer); 270 rf_GenericWakeupFunc(node, 0); 271} 272 273void 274rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node) 275{ 276 /* The situation: 277 * 278 * We are doing a write that hits only one failed data unit. The other 279 * failed data unit is not being overwritten, so we need to generate 280 * it. 281 * 282 * For the moment, we assume all the nonfailed data being written is in 283 * the shadow of the failed data unit. 
(i.e,, either a single data 284 * unit write or the entire failed stripe unit is being overwritten. ) 285 * 286 * Recovery strategy: apply the recovery data to the parity and q. Use P 287 * & Q to recover the second failed data unit in P. Zero fill Q, then 288 * apply the recovered data to p. Then apply the data being written to 289 * the failed drive. Then walk through the surviving drives, applying 290 * new data when it exists, otherwise the recovery data. Quite a mess. 291 * 292 * 293 * The params 294 * 295 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... , 296 * write pda (numStripeUnitAccess - numDataFailed), failed pda, 297 * raidPtr, asmap */ 298 299 int np = node->numParams; 300 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; 301 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; 302 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); 303 int i; 304 RF_RaidAddr_t sosAddr; 305 unsigned coeff; 306 RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 307 RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda; 308 int numDataCol = layoutPtr->numDataCol; 309 RF_Etimer_t timer; 310 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 311 312 RF_ASSERT(node->numResults == 2); 313 RF_ASSERT(asmap->failedPDAs[1] == NULL); 314 RF_ETIMER_START(timer); 315 ppda = node->results[0]; 316 qpda = node->results[1]; 317 /* apply the recovery data */ 318 for (i = 0; i < numDataCol - 2; i++) 319 applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp); 320 321 /* determine the other failed data unit */ 322 pda = asmap->failedPDAs[0]; 323 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 324 /* need to determine the column of the other failed disk */ 325 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); 326 /* compute the data unit offset within the column */ 327 coeff = (coeff % raidPtr->Layout.numDataCol); 328 for (i = 0; i < numDataCol; i++) { 329 
npda.raidAddress = sosAddr + (i * secPerSU); 330 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0); 331 /* skip over dead disks */ 332 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status)) 333 if (i != coeff) 334 break; 335 } 336 RF_ASSERT(i < numDataCol); 337 /* recover the data. The column we want to recover we write over the 338 * parity. The column we don't care about we dump in q. */ 339 if (coeff < i) /* recovering 'a' */ 340 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i); 341 else /* recovering 'b' */ 342 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff); 343 344 /* OK. The valid data is in P. Zero fill Q, then inc it into it. */ 345 memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector)); 346 rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i); 347 348 /* now apply all the write data to the buffer */ 349 /* single stripe unit write case: the failed data is only thing we are 350 * writing. 
*/ 351 RF_ASSERT(asmap->numStripeUnitsAccessed == 1); 352 /* dest, src, len, coeff */ 353 rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff); 354 /* rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp); */ 355 rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector)); 356 357 /* now apply all the recovery data */ 358 for (i = 0; i < numDataCol - 2; i++) 359 applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp); 360 361 RF_ETIMER_STOP(timer); 362 RF_ETIMER_EVAL(timer); 363 if (tracerec) 364 tracerec->q_us += RF_ETIMER_VAL_US(timer); 365 366 rf_GenericWakeupFunc(node, 0); 367} 368RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite) 369{ 370 RF_PANIC(); 371} 372/* 373 Two lost data unit write case. 374 375 There are really two cases here: 376 377 (1) The write completely covers the two lost data units. 378 In that case, a reconstruct write that doesn't write the 379 failed data units will do the correct thing. So in this case, 380 the dag looks like 381 382 full stripe read of surviving data units (not being overwritten) 383 write new data (ignoring failed units) compute P&Q 384 write P&Q 385 386 387 (2) The write does not completely cover both failed data units 388 (but touches at least one of them). Then we need to do the 389 equivalent of a reconstruct read to recover the missing data 390 unit from the other stripe. 391 392 For any data we are writing that is not in the "shadow" 393 of the failed units, we need to do a four cycle update. 394 PANIC on this case. 
for now 395 396*/ 397 398RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG) 399{ 400 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 401 RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit; 402 int sum; 403 int nf = asmap->numDataFailed; 404 405 sum = asmap->failedPDAs[0]->numSector; 406 if (nf == 2) 407 sum += asmap->failedPDAs[1]->numSector; 408 409 if ((nf == 2) && (sum == (2 * sectorsPerSU))) { 410 /* large write case */ 411 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList); 412 return; 413 } 414 if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) { 415 /* small write case, no user data not in shadow */ 416 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList); 417 return; 418 } 419 RF_PANIC(); 420} 421RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite) 422{ 423 rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc); 424} 425#endif /* (RF_INCLUDE_DECL_PQ > 0) || 426 * (RF_INCLUDE_RAID6 > 0) */ 427