/*	$NetBSD: rf_pqdegdags.c,v 1.17 2023/10/15 18:15:20 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.17 2023/10/15 18:15:20 oster Exp $");

#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"
static void
applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
    RF_PhysDiskAddr_t * qpda, const struct buf *bp);

/*
   Two data drives have failed, and we are doing a read that covers one of them.
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                      | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving user data _not_ being read by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data
 * X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but they have not been deleted because putting them back in
 * later would be a tremendous effort.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer where the
 * failed data is supposed to end up.  This buffer is zeroed by the code here.  Thus,
 * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
 * zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what's needed and what's not.
 ****************************************************************************************/
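
/*
 * In outline, the P+Q scheme implemented by rf_pq.c is a Reed-Solomon
 * style code over GF(2^8):
 *
 *     P = D_0 ^ D_1 ^ ... ^ D_{n-1}
 *     Q = (c_0 * D_0) ^ (c_1 * D_1) ^ ... ^ (c_{n-1} * D_{n-1})
 *
 * where D_k is the data unit with offset k in the stripe, c_k is its
 * coefficient, ^ is xor, and * is field multiplication.  With two failed
 * data units a and b, P and Q yield two independent equations in D_a and
 * D_b, which the X node (rf_PQDoubleRecoveryFunc below) solves via
 * rf_PQ_recover().
 */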
/*   init a disk node with 2 successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, const struct buf *bp)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
	RF_SectorCount_t s0len = ppda->numSector, len;
	RF_SectorNum_t suoffset;
	unsigned coeff;
	char   *pbuf = ppda->bufPtr;
	char   *qbuf = qpda->bufPtr;
	char   *buf;
	int     delta;

	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
	len = pda->numSector;
	/* see if pda intersects a recovery pda */
	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
		buf = pda->bufPtr;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
		coeff = (coeff % raidPtr->Layout.numDataCol);

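		/*
		 * Clip the accessed range [suoffset, suoffset + len) to the
		 * recovery range [s0off, s0off + s0len): skip any leading
		 * non-overlap in the data buffer, advance the P/Q buffers
		 * when the access begins inside the recovery range, and
		 * truncate any trailing overhang.
		 */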
		if (suoffset < s0off) {
			delta = s0off - suoffset;
			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
			suoffset = s0off;
			len -= delta;
		}
		if (suoffset > s0off) {
			delta = suoffset - s0off;
			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
		}
		if ((suoffset + len) > (s0len + s0off))
			len = s0len + s0off - suoffset;

		/* src, dest, len */
		/* rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp); */
		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len));

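		/*
		 * rf_IncQ (rf_pq.c) accumulates src into dest weighted by
		 * the column coefficient, roughly dest ^= c_coeff * src
		 * over GF(2^8); together with the rf_bxor() above, this
		 * backs the accessed data out of (or into) both P and Q.
		 */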
		/* dest, src, len, coeff */
		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
	}
}
/*
   Recover data in the case of a double failure. There can be two
   result buffers, one for each chunk of data being recovered.
   The params are pdas that have not been range-restricted or otherwise
   politely massaged - this should be done here. The last params are the
   pdas of P and Q, followed by the raidPtr. The list can look like

   pda, pda, ... , p pda, q pda, raidptr, asm

   or

   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm

   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
*/


void
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr; /* , suoffset; */
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	int     two = 0;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	/* char   *buf; */
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		/*		ppda2 = node->params[np - 5].p; */
		qpda = node->params[np - 4].p;
		/* 		qpda2 = node->params[np - 3].p; */
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		/* 		buf = pda->bufPtr; */
		/* 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); */
		/* 		len = pda->numSector; */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* see if pda intersects a recovery pda */
		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		if (two)
			applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
	}

	/* ok, we got the parity back to the point where we can recover. We
	 * now need to determine the coeff of the columns that need to be
	 * recovered. We may also need to recover only a single stripe unit. */

	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
						 * to recover. */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		/* need to determine the column of the other failed disk */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
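		/*
		 * Walk the data units of this stripe; the loop exits at the
		 * first dead disk whose data unit offset differs from coeff,
		 * leaving i as the offset of the other failed unit.
		 */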
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
			/* skip over dead disks */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/* recover the data. Since we only want to recover one
		 * column, we overwrite the parity with the other one. */
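		/*
		 * rf_PQ_recover (rf_pq.c) solves the two P/Q equations for
		 * the two missing columns, whose data unit offsets are
		 * passed as the last two arguments (apparently required in
		 * ascending order, hence the two call shapes below), and
		 * deposits the results in the third and fourth buffer
		 * arguments.
		 */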
		if (coeff < i)	/* recovering 'a' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
		else		/* recovering 'b' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
}

void
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and Q. Use P
	 * & Q to recover the second failed data unit in P. Zero-fill Q, then
	 * apply the recovered data to P. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
	 * raidPtr, asmap */

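	/*
	 * A sketch of the algebra, assuming rf_IncQ computes
	 * dest ^= c_coeff * src over GF(2^8) and that, as the comments
	 * below state, the recovery step leaves the regenerated unit in
	 * the parity buffer: with i the offset of the unwritten failed
	 * unit, c that of the written one, D_i the regenerated data, D_c
	 * the new data, and D_k the surviving read data, the steps below
	 * produce
	 *
	 *     P = D_i ^ D_c ^ (xor over k of D_k)
	 *     Q = (c_i * D_i) ^ (c_c * D_c) ^ (xor over k of c_k * D_k)
	 *
	 * i.e. the parity and Q of the stripe as it will stand after the
	 * write.
	 */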
	int     np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int     i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int     numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to recover is written over
	 * the parity; the column we don't care about is dumped in q. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero-fill Q, then accumulate the
	 * recovered data into it. */
	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is the only thing
	 * we are writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	/* rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp); */
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector));

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
/*
   Two lost data unit write case.

   There are really two cases here:

   (1) The write completely covers the two lost data units.
       In that case, a reconstruct write that doesn't write the
       failed data units will do the correct thing. So in this case,
       the dag looks like

            full stripe read of surviving data units (not being overwritten)
	    write new data (ignoring failed units)   compute P&Q
	                                             write P&Q


   (2) The write does not completely cover both failed data units
       (but touches at least one of them). Then we need to do the
       equivalent of a reconstruct read to recover the missing data
       unit from the other stripe.

       For any data we are writing that is not in the "shadow"
       of the failed units, we need to do a four-cycle update.
       PANIC in this case, for now.

*/

RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	int     sum;
	int     nf = asmap->numDataFailed;

	sum = asmap->failedPDAs[0]->numSector;
	if (nf == 2)
		sum += asmap->failedPDAs[1]->numSector;

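	/*
	 * Classify the access: if both failed units are accessed and the
	 * failed sectors add up to two full stripe units, the write covers
	 * both failed units completely (case (1) above); if every accessed
	 * unit is failed, or at least a full stripe unit of failed data is
	 * covered, the small-write path applies; anything else is the
	 * unimplemented four-cycle-update case.
	 */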
	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
		/* large write case */
		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
		/* small write case: no user data outside the shadow */
		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	RF_PANIC();
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
#endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
				 * (RF_INCLUDE_RAID6 > 0) */