1/*	$NetBSD: rf_parityscan.c,v 1.33 2009/11/17 18:54:26 jld Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*****************************************************************************
30 *
31 * rf_parityscan.c -- misc utilities related to parity verification
32 *
33 ****************************************************************************/
34
35#include <sys/cdefs.h>
36__KERNEL_RCSID(0, "$NetBSD: rf_parityscan.c,v 1.33 2009/11/17 18:54:26 jld Exp $");
37
38#include <dev/raidframe/raidframevar.h>
39
40#include "rf_raid.h"
41#include "rf_dag.h"
42#include "rf_dagfuncs.h"
43#include "rf_dagutils.h"
44#include "rf_mcpair.h"
45#include "rf_general.h"
46#include "rf_engine.h"
47#include "rf_parityscan.h"
48#include "rf_map.h"
49#include "rf_paritymap.h"
50
51/*****************************************************************************
52 *
 * walk through the entire array and write new parity.  This works by
54 * creating two DAGs, one to read a stripe of data and one to write
55 * new parity.  The first is executed, the data is xored together, and
56 * then the second is executed.  To avoid constantly building and
57 * tearing down the DAGs, we create them a priori and fill them in
58 * with the mapping information as we go along.
59 *
60 * there should never be more than one thread running this.
61 *
62 ****************************************************************************/
63
64int
65rf_RewriteParity(RF_Raid_t *raidPtr)
66{
67	if (raidPtr->parity_map != NULL)
68		return rf_paritymap_rewrite(raidPtr->parity_map);
69	else
70		return rf_RewriteParityRange(raidPtr, 0, raidPtr->totalSectors);
71}
72
73int
74rf_RewriteParityRange(RF_Raid_t *raidPtr, RF_SectorNum_t sec_begin,
75    RF_SectorNum_t sec_len)
76{
77	/*
78	 * Note: It is the caller's responsibility to ensure that
79	 * sec_begin and sec_len are stripe-aligned.
80	 */
81	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
82	RF_AccessStripeMapHeader_t *asm_h;
83	int ret_val;
84	int rc;
85	RF_SectorNum_t i;
86
87	if (raidPtr->Layout.map->faultsTolerated == 0) {
88		/* There isn't any parity. Call it "okay." */
89		return (RF_PARITY_OKAY);
90	}
91	if (raidPtr->status != rf_rs_optimal) {
92		/*
93		 * We're in degraded mode.  Don't try to verify parity now!
94		 * XXX: this should be a "we don't want to", not a
95		 * "we can't" error.
96		 */
97		return (RF_PARITY_COULD_NOT_VERIFY);
98	}
99
100	ret_val = 0;
101
102	rc = RF_PARITY_OKAY;
103
104	for (i = sec_begin; i < sec_begin + sec_len &&
105		     rc <= RF_PARITY_CORRECTED;
106	     i += layoutPtr->dataSectorsPerStripe) {
107		if (raidPtr->waitShutdown) {
108			/* Someone is pulling the plug on this set...
109			   abort the re-write */
110			return (1);
111		}
112		asm_h = rf_MapAccess(raidPtr, i,
113				     layoutPtr->dataSectorsPerStripe,
114				     NULL, RF_DONT_REMAP);
115		raidPtr->parity_rewrite_stripes_done =
116			i / layoutPtr->dataSectorsPerStripe ;
117		rc = rf_VerifyParity(raidPtr, asm_h->stripeMap, 1, 0);
118
119		switch (rc) {
120		case RF_PARITY_OKAY:
121		case RF_PARITY_CORRECTED:
122			break;
123		case RF_PARITY_BAD:
124			printf("Parity bad during correction\n");
125			ret_val = 1;
126			break;
127		case RF_PARITY_COULD_NOT_CORRECT:
128			printf("Could not correct bad parity\n");
129			ret_val = 1;
130			break;
131		case RF_PARITY_COULD_NOT_VERIFY:
132			printf("Could not verify parity\n");
133			ret_val = 1;
134			break;
135		default:
136			printf("Bad rc=%d from VerifyParity in RewriteParity\n", rc);
137			ret_val = 1;
138		}
139		rf_FreeAccessStripeMap(asm_h);
140	}
141	return (ret_val);
142}
143/*****************************************************************************
144 *
145 * verify that the parity in a particular stripe is correct.  we
146 * validate only the range of parity defined by parityPDA, since this
147 * is all we have locked.  The way we do this is to create an asm that
148 * maps the whole stripe and then range-restrict it to the parity
149 * region defined by the parityPDA.
150 *
151 ****************************************************************************/
152int
153rf_VerifyParity(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *aasm,
154		int correct_it, RF_RaidAccessFlags_t flags)
155{
156	RF_PhysDiskAddr_t *parityPDA;
157	RF_AccessStripeMap_t *doasm;
158	const RF_LayoutSW_t *lp;
159	int     lrc, rc;
160
161	lp = raidPtr->Layout.map;
162	if (lp->faultsTolerated == 0) {
163		/*
164	         * There isn't any parity. Call it "okay."
165	         */
166		return (RF_PARITY_OKAY);
167	}
168	rc = RF_PARITY_OKAY;
169	if (lp->VerifyParity) {
170		for (doasm = aasm; doasm; doasm = doasm->next) {
171			for (parityPDA = doasm->parityInfo; parityPDA;
172			     parityPDA = parityPDA->next) {
173				lrc = lp->VerifyParity(raidPtr,
174						       doasm->raidAddress,
175						       parityPDA,
176						       correct_it, flags);
177				if (lrc > rc) {
178					/* see rf_parityscan.h for why this
179					 * works */
180					rc = lrc;
181				}
182			}
183		}
184	} else {
185		rc = RF_PARITY_COULD_NOT_VERIFY;
186	}
187	return (rc);
188}
189
/*
 * Verify (and optionally rewrite) the parity covered by parityPDA for the
 * stripe that contains raidAddr.
 *
 * The routine reads the data units and the parity unit of the stripe,
 * each range-restricted to the region described by parityPDA, XORs the
 * data units together into a scratch buffer and compares the result
 * byte-by-byte with the parity that was read.  On a mismatch, if
 * correct_it is non-zero, the computed parity is written back over the
 * parity unit.
 *
 * Return values, as set below:
 *   RF_PARITY_OKAY              parity matched, or a needed disk was
 *                               reported dead by rf_TryToRedirectPDA()
 *                               (nothing can be verified in that case)
 *   RF_PARITY_BAD               mismatch found and correct_it was zero
 *   RF_PARITY_CORRECTED         mismatch found and the rewrite succeeded
 *   RF_PARITY_COULD_NOT_VERIFY  the read DAG did not finish enabled
 *   RF_PARITY_COULD_NOT_CORRECT the write DAG did not finish enabled
 *
 * flags is handed to rf_MakeSimpleDAG() for both DAGs.
 */
int
rf_VerifyParityBasic(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
		     RF_PhysDiskAddr_t *parityPDA, int correct_it,
		     RF_RaidAccessFlags_t flags)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
								     raidAddr);
	RF_SectorCount_t numsector = parityPDA->numSector;
	/* size in bytes of one unit's worth of the region being checked */
	int     numbytes = rf_RaidAddressToByte(raidPtr, numsector);
	/* bytes of data (excluding parity) read for the region */
	int     bytesPerStripe = numbytes * layoutPtr->numDataCol;
	RF_DagHeader_t *rd_dag_h, *wr_dag_h;	/* read, write dag */
	RF_DagNode_t *blockNode, *wrBlock;
	RF_AccessStripeMapHeader_t *asm_h;
	RF_AccessStripeMap_t *asmap;
	RF_AllocListElem_t *alloclist;
	RF_PhysDiskAddr_t *pda;
	char   *pbuf, *bf, *end_p, *p;
	int     i, retcode;
	RF_ReconUnitNum_t which_ru;
	RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr,
							     raidAddr,
							     &which_ru);
	int     stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
#if RF_ACC_TRACE > 0
	RF_AccTraceEntry_t tracerec;
#endif
	RF_MCPair_t *mcpair;

	retcode = RF_PARITY_OKAY;

	mcpair = rf_AllocMCPair();
	rf_MakeAllocList(alloclist);
	/*
	 * bf receives everything read from disk: numDataCol data units
	 * followed by the parity unit(s).  pbuf accumulates the XOR of the
	 * data units.
	 * NOTE(review): the XOR loop below assumes pbuf starts out zero-filled,
	 * i.e. that RF_MallocAndAdd returns zeroed memory -- confirm.
	 */
	RF_MallocAndAdd(bf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
	RF_MallocAndAdd(pbuf, numbytes, (char *), alloclist);
	/* end of the data portion of bf; the parity read lands at this offset */
	end_p = bf + bytesPerStripe;

	rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, bf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
	    "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
	blockNode = rd_dag_h->succedents[0];

	/* map the stripe and fill in the PDAs in the dag */
	asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, bf, RF_DONT_REMAP);
	asmap = asm_h->stripeMap;

	/* one read node per data column, each trimmed to the parityPDA range */
	for (pda = asmap->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next) {
		RF_ASSERT(pda);
		rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
		RF_ASSERT(pda->numSector != 0);
		if (rf_TryToRedirectPDA(raidPtr, pda, 0))
			goto out;	/* no way to verify parity if disk is
					 * dead.  return w/ good status */
		blockNode->succedents[i]->params[0].p = pda;
		blockNode->succedents[i]->params[2].v = psID;
		blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
	}

	/* this routine handles exactly one parity unit per stripe */
	RF_ASSERT(!asmap->parityInfo->next);
	rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
	RF_ASSERT(asmap->parityInfo->numSector != 0);
	if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
		goto out;	/* parity disk is dead: same "good status" exit */
	/*
	 * NOTE(review): unlike the data nodes above and the write node below,
	 * only params[0] is filled in for the parity read node here;
	 * params[2] and params[3] are left as rf_MakeSimpleDAG() set them up
	 * -- confirm this is intentional.
	 */
	blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo;

	/* fire off the DAG */
#if RF_ACC_TRACE > 0
	memset((char *) &tracerec, 0, sizeof(tracerec));
	rd_dag_h->tracerec = &tracerec;
#endif
#if 0
	if (rf_verifyParityDebug) {
		printf("Parity verify read dag:\n");
		rf_PrintDAGList(rd_dag_h);
	}
#endif
	/* clear the completion flag before dispatch, then sleep until the
	 * DAG's callback (rf_MCPairWakeupFunc) has set it */
	RF_LOCK_MCPAIR(mcpair);
	mcpair->flag = 0;
	RF_UNLOCK_MCPAIR(mcpair);

	rf_DispatchDAG(rd_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
	    (void *) mcpair);

	RF_LOCK_MCPAIR(mcpair);
	while (!mcpair->flag)
		RF_WAIT_MCPAIR(mcpair);
	RF_UNLOCK_MCPAIR(mcpair);
	if (rd_dag_h->status != rf_enable) {
		RF_ERRORMSG("Unable to verify parity:  can't read the stripe\n");
		retcode = RF_PARITY_COULD_NOT_VERIFY;
		goto out;
	}
	/* fold every data unit into pbuf to obtain the expected parity */
	for (p = bf; p < end_p; p += numbytes) {
		rf_bxor(p, pbuf, numbytes);
	}
	/* compare expected parity with what was read; stop at first mismatch */
	for (i = 0; i < numbytes; i++) {
		if (pbuf[i] != bf[bytesPerStripe + i]) {
			/* only report the mismatch when we are not going to fix it */
			if (!correct_it)
				RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
				    i, (u_char) bf[bytesPerStripe + i], (u_char) pbuf[i]);
			retcode = RF_PARITY_BAD;
			break;
		}
	}

	/* mismatch found and the caller asked for correction: write pbuf
	 * over the parity unit with a single-node write DAG */
	if (retcode && correct_it) {
		wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
		wrBlock = wr_dag_h->succedents[0];
		wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
		wrBlock->succedents[0]->params[2].v = psID;
		wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
#if RF_ACC_TRACE > 0
		memset((char *) &tracerec, 0, sizeof(tracerec));
		wr_dag_h->tracerec = &tracerec;
#endif
#if 0
		if (rf_verifyParityDebug) {
			printf("Parity verify write dag:\n");
			rf_PrintDAGList(wr_dag_h);
		}
#endif
		RF_LOCK_MCPAIR(mcpair);
		mcpair->flag = 0;
		RF_UNLOCK_MCPAIR(mcpair);

		rf_DispatchDAG(wr_dag_h, (void (*) (void *)) rf_MCPairWakeupFunc,
		    (void *) mcpair);

		RF_LOCK_MCPAIR(mcpair);
		while (!mcpair->flag)
			RF_WAIT_MCPAIR(mcpair);
		RF_UNLOCK_MCPAIR(mcpair);
		if (wr_dag_h->status != rf_enable) {
			RF_ERRORMSG("Unable to correct parity in VerifyParity:  can't write the stripe\n");
			retcode = RF_PARITY_COULD_NOT_CORRECT;
		}
		rf_FreeDAG(wr_dag_h);
		/* still RF_PARITY_BAD means the write did not fail: report the fix */
		if (retcode == RF_PARITY_BAD)
			retcode = RF_PARITY_CORRECTED;
	}
out:
	/* common exit: release the map, the buffers on alloclist, the read
	 * DAG and the mcpair regardless of how we got here */
	rf_FreeAccessStripeMap(asm_h);
	rf_FreeAllocList(alloclist);
	rf_FreeDAG(rd_dag_h);
	rf_FreeMCPair(mcpair);
	return (retcode);
}
337
338int
339rf_TryToRedirectPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
340    int parity)
341{
342	if (raidPtr->Disks[pda->col].status == rf_ds_reconstructing) {
343		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, pda->startSector)) {
344#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
345			if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
346#if RF_DEBUG_VERIFYPARITY
347				RF_RowCol_t oc = pda->col;
348				RF_SectorNum_t os = pda->startSector;
349#endif
350				if (parity) {
351					(raidPtr->Layout.map->MapParity) (raidPtr, pda->raidAddress, &pda->col, &pda->startSector, RF_REMAP);
352#if RF_DEBUG_VERIFYPARITY
353					if (rf_verifyParityDebug)
354						printf("VerifyParity: Redir P c %d sect %ld -> c %d sect %ld\n",
355						    oc, (long) os, pda->col, (long) pda->startSector);
356#endif
357				} else {
358					(raidPtr->Layout.map->MapSector) (raidPtr, pda->raidAddress, &pda->col, &pda->startSector, RF_REMAP);
359#if RF_DEBUG_VERIFYPARITY
360					if (rf_verifyParityDebug)
361						printf("VerifyParity: Redir D c %d sect %ld -> c %d sect %ld\n",
362						   oc, (long) os, pda->col, (long) pda->startSector);
363#endif
364				}
365			} else {
366#endif
367				RF_RowCol_t spCol = raidPtr->Disks[pda->col].spareCol;
368				pda->col = spCol;
369#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
370			}
371#endif
372		}
373	}
374	if (RF_DEAD_DISK(raidPtr->Disks[pda->col].status))
375		return (1);
376	return (0);
377}
378/*****************************************************************************
379 *
380 * currently a stub.
381 *
382 * takes as input an ASM describing a write operation and containing
383 * one failure, and verifies that the parity was correctly updated to
384 * reflect the write.
385 *
386 * if it's a data unit that's failed, we read the other data units in
387 * the stripe and the parity unit, XOR them together, and verify that
388 * we get the data intended for the failed disk.  Since it's easy, we
389 * also validate that the right data got written to the surviving data
390 * disks.
391 *
392 * If it's the parity that failed, there's really no validation we can
393 * do except the above verification that the right data got written to
394 * all disks.  This is because the new data intended for the failed
395 * disk is supplied in the ASM, but this is of course not the case for
396 * the new parity.
397 *
398 ****************************************************************************/
#if 0
/*
 * Compiled out.  Placeholder only: performs none of the checks described
 * above and unconditionally returns 0.
 */
int
rf_VerifyDegrModeWrite(RF_Raid_t *raidPtr, RF_AccessStripeMapHeader_t *asmh)
{
	return (0);
}
#endif
406/* creates a simple DAG with a header, a block-recon node at level 1,
407 * nNodes nodes at level 2, an unblock-recon node at level 3, and a
408 * terminator node at level 4.  The stripe address field in the block
409 * and unblock nodes are not touched, nor are the pda fields in the
410 * second-level nodes, so they must be filled in later.
411 *
412 * commit point is established at unblock node - this means that any
413 * failure during dag execution causes the dag to fail
414 *
415 * name - node names at the second level
416 */
417RF_DagHeader_t *
418rf_MakeSimpleDAG(RF_Raid_t *raidPtr, int nNodes, int bytesPerSU, char *databuf,
419		 int (*doFunc) (RF_DagNode_t * node),
420		 int (*undoFunc) (RF_DagNode_t * node),
421		 const char *name, RF_AllocListElem_t *alloclist,
422		 RF_RaidAccessFlags_t flags, int priority)
423{
424	RF_DagHeader_t *dag_h;
425	RF_DagNode_t *nodes, *termNode, *blockNode, *unblockNode, *tmpNode;
426	int     i;
427
428	/* grab a DAG header... */
429
430	dag_h = rf_AllocDAGHeader();
431	dag_h->raidPtr = (void *) raidPtr;
432	dag_h->allocList = NULL;/* we won't use this alloc list */
433	dag_h->status = rf_enable;
434	dag_h->numSuccedents = 1;
435	dag_h->creator = "SimpleDAG";
436
437	/* this dag can not commit until the unblock node is reached errors
438	 * prior to the commit point imply the dag has failed */
439	dag_h->numCommitNodes = 1;
440	dag_h->numCommits = 0;
441
442	/* create the nodes, the block & unblock nodes, and the terminator
443	 * node */
444
445	for (i = 0; i < nNodes; i++) {
446		tmpNode = rf_AllocDAGNode();
447		tmpNode->list_next = dag_h->nodes;
448		dag_h->nodes = tmpNode;
449	}
450	nodes = dag_h->nodes;
451
452	blockNode = rf_AllocDAGNode();
453	blockNode->list_next = dag_h->nodes;
454	dag_h->nodes = blockNode;
455
456	unblockNode = rf_AllocDAGNode();
457	unblockNode->list_next = dag_h->nodes;
458	dag_h->nodes = unblockNode;
459
460	termNode = rf_AllocDAGNode();
461	termNode->list_next = dag_h->nodes;
462	dag_h->nodes = termNode;
463
464	dag_h->succedents[0] = blockNode;
465	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", alloclist);
466	rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", alloclist);
467	unblockNode->succedents[0] = termNode;
468	tmpNode = nodes;
469	for (i = 0; i < nNodes; i++) {
470		blockNode->succedents[i] = unblockNode->antecedents[i] = tmpNode;
471		unblockNode->antType[i] = rf_control;
472		rf_InitNode(tmpNode, rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, alloclist);
473		tmpNode->succedents[0] = unblockNode;
474		tmpNode->antecedents[0] = blockNode;
475		tmpNode->antType[0] = rf_control;
476		tmpNode->params[1].p = (databuf + (i * bytesPerSU));
477		tmpNode = tmpNode->list_next;
478	}
479	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", alloclist);
480	termNode->antecedents[0] = unblockNode;
481	termNode->antType[0] = rf_control;
482	return (dag_h);
483}
484