rf_dagfuncs.c revision 1.3
1/*	$NetBSD: rf_dagfuncs.c,v 1.3 1999/02/05 00:06:08 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * dagfuncs.c -- DAG node execution routines
31 *
32 * Rules:
33 * 1. Every DAG execution function must eventually cause node->status to
34 *    get set to "good" or "bad", and "FinishNode" to be called. In the
35 *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
36 *    the node execution function can do these two things directly. In
37 *    the case of nodes that have to wait for some event (a disk read to
38 *    complete, a lock to be released, etc) to occur before they can
39 *    complete, this is typically achieved by having whatever module
40 *    is doing the operation call GenericWakeupFunc upon completion.
41 * 2. DAG execution functions should check the status in the DAG header
42 *    and NOP out their operations if the status is not "enable". However,
43 *    execution functions that release resources must be sure to release
44 *    them even when they NOP out the function that would use them.
45 *    Functions that acquire resources should go ahead and acquire them
46 *    even when they NOP, so that a downstream release node will not have
47 *    to check to find out whether or not the acquire was suppressed.
48 */
49
50#include <sys/ioctl.h>
51#include <sys/param.h>
52
53#include "rf_archs.h"
54#include "rf_raid.h"
55#include "rf_dag.h"
56#include "rf_layout.h"
57#include "rf_etimer.h"
58#include "rf_acctrace.h"
59#include "rf_diskqueue.h"
60#include "rf_dagfuncs.h"
61#include "rf_general.h"
62#include "rf_engine.h"
63#include "rf_dagutils.h"
64
65#include "rf_kintf.h"
66
67#if RF_INCLUDE_PARITYLOGGING > 0
68#include "rf_paritylog.h"
69#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
70
71int     (*rf_DiskReadFunc) (RF_DagNode_t *);
72int     (*rf_DiskWriteFunc) (RF_DagNode_t *);
73int     (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
74int     (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
75int     (*rf_DiskUnlockFunc) (RF_DagNode_t *);
76int     (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
77int     (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
78int     (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
79int     (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
80
81/*****************************************************************************************
82 * main (only) configuration routine for this module
83 ****************************************************************************************/
int
rf_ConfigureDAGFuncs(listp)
	RF_ShutdownList_t **listp;
{
	/* Install the default DAG node execution/undo functions.  The
	 * assertion verifies that RF_LONGSHIFT agrees with sizeof(long),
	 * since the word-at-a-time XOR routines below use RF_LONGSHIFT to
	 * convert byte counts into longword counts. */
	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
	rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
	rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
	return (0);
}
100
101
102
103/*****************************************************************************************
104 * the execution function associated with a terminate node
105 ****************************************************************************************/
int
rf_TerminateFunc(node)
	RF_DagNode_t *node;
{
	/* Terminate node: completes immediately.  By the time the DAG
	 * terminates, every commit node must have fired. */
	RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
	node->status = rf_good;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}
114
115int
116rf_TerminateUndoFunc(node)
117	RF_DagNode_t *node;
118{
119	return (0);
120}
121
122
123/*****************************************************************************************
124 * execution functions associated with a mirror node
125 *
126 * parameters:
127 *
128 * 0 - physical disk addres of data
129 * 1 - buffer for holding read data
130 * 2 - parity stripe ID
131 * 3 - flags
132 * 4 - physical disk address of mirror (parity)
133 *
134 ****************************************************************************************/
135
int
rf_DiskReadMirrorIdleFunc(node)
	RF_DagNode_t *node;
{
	/* select the mirror copy with the shortest queue and fill in node
	 * parameters with physical disk address, then issue the read as an
	 * ordinary disk-read node */

	rf_SelectMirrorDiskIdle(node);
	return (rf_DiskReadFunc(node));
}
146
int
rf_DiskReadMirrorPartitionFunc(node)
	RF_DagNode_t *node;
{
	/* select the mirror copy with the shortest queue and fill in node
	 * parameters with physical disk address, then issue the read as an
	 * ordinary disk-read node */

	rf_SelectMirrorDiskPartition(node);
	return (rf_DiskReadFunc(node));
}
157
158int
159rf_DiskReadMirrorUndoFunc(node)
160	RF_DagNode_t *node;
161{
162	return (0);
163}
164
165
166
167#if RF_INCLUDE_PARITYLOGGING > 0
168/*****************************************************************************************
169 * the execution function associated with a parity log update node
170 ****************************************************************************************/
int
rf_ParityLogUpdateFunc(node)
	RF_DagNode_t *node;
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	/* Append an "update" record (pda, buf) to the parity log.  On
	 * success the parity-log module invokes node->wakeFunc when the
	 * append completes; on allocation failure we charge the elapsed
	 * time to the trace record and wake the node ourselves with
	 * ENOMEM. */
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
		    (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node,
		    node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
198
199
200/*****************************************************************************************
201 * the execution function associated with a parity log overwrite node
202 ****************************************************************************************/
int
rf_ParityLogOverwriteFunc(node)
	RF_DagNode_t *node;
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	/* Append an "overwrite" record (pda, buf) to the parity log.  On
	 * success the parity-log module invokes node->wakeFunc when the
	 * append completes; on allocation failure we charge the elapsed
	 * time to the trace record and wake the node ourselves with
	 * ENOMEM. */
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
228#else				/* RF_INCLUDE_PARITYLOGGING > 0 */
229
230int
231rf_ParityLogUpdateFunc(node)
232	RF_DagNode_t *node;
233{
234	return (0);
235}
236int
237rf_ParityLogOverwriteFunc(node)
238	RF_DagNode_t *node;
239{
240	return (0);
241}
242#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
243
244int
245rf_ParityLogUpdateUndoFunc(node)
246	RF_DagNode_t *node;
247{
248	return (0);
249}
250
251int
252rf_ParityLogOverwriteUndoFunc(node)
253	RF_DagNode_t *node;
254{
255	return (0);
256}
257/*****************************************************************************************
258 * the execution function associated with a NOP node
259 ****************************************************************************************/
int
rf_NullNodeFunc(node)
	RF_DagNode_t *node;
{
	/* NOP node: succeeds immediately */
	node->status = rf_good;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}
267
int
rf_NullNodeUndoFunc(node)
	RF_DagNode_t *node;
{
	/* undo of a NOP: mark the node undone and finish immediately */
	node->status = rf_undone;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}
275
276
277/*****************************************************************************************
278 * the execution function associated with a disk-read node
279 ****************************************************************************************/
int
rf_DiskReadFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	/* if the DAG is not enabled, issue a NOP I/O instead of a read so
	 * that any queue lock/unlock side effects still occur (see rule 2
	 * at the top of this file) */
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void   *b_proc = NULL;
#if RF_BACKWARD > 0
	caddr_t undoBuf;
#endif

	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	/* a request may lock or unlock its disk queue, but never both */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
#if RF_BACKWARD > 0
	/* allocate and zero the undo buffer. this is equivalent to copying
	 * the original buffer's contents to the undo buffer prior to
	 * performing the disk read. XXX hardcoded 512 bytes per sector! */
	if (node->dagHdr->allocList == NULL)
		rf_MakeAllocList(node->dagHdr->allocList);
	RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif				/* RF_BACKWARD > 0 */
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    node, NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr), flags, b_proc);
	if (!req) {
		/* could not allocate the request: wake the node with an error */
		(node->wakeFunc) (node, ENOMEM);
	} else {
		/* node completes later, when the disk I/O callback fires */
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}
	return (0);
}
327
328
329/*****************************************************************************************
330 * the execution function associated with a disk-write node
331 ****************************************************************************************/
int
rf_DiskWriteFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	/* if the DAG is not enabled, issue a NOP I/O instead of a write so
	 * that any queue lock/unlock side effects still occur (see rule 2
	 * at the top of this file) */
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void   *b_proc = NULL;
#if RF_BACKWARD > 0
	caddr_t undoBuf;
#endif

	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

#if RF_BACKWARD > 0
	/* This area is used only for backward error recovery experiments
	 * First, schedule allocate a buffer and schedule a pre-read of the
	 * disk After the pre-read, proceed with the normal disk write.
	 * The two-phase sequence is driven by node->status: the first call
	 * (rf_fired) schedules the pre-read and sets rf_bwd1; when the
	 * pre-read completes, rf_GenericWakeupFunc sets rf_bwd2 and calls
	 * back into this function to issue the real write. */
	if (node->status == rf_bwd2) {
		/* just finished undo logging, now perform real function */
		node->status = rf_fired;
		RF_ASSERT(!(lock && unlock));
		flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
		flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
		req = rf_CreateDiskQueueData(iotype,
		    pda->startSector, pda->numSector, buf, parityStripeID, which_ru,
		    node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
		    (void *) (node->dagHdr->raidPtr), flags, b_proc);

		if (!req) {
			(node->wakeFunc) (node, ENOMEM);
		} else {
			node->dagFuncData = (void *) req;
			rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
		}
	} else {
		/* node status should be rf_fired */
		/* schedule a disk pre-read */
		node->status = rf_bwd1;
		RF_ASSERT(!(lock && unlock));
		flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
		flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
		if (node->dagHdr->allocList == NULL)
			rf_MakeAllocList(node->dagHdr->allocList);
		/* XXX hardcoded 512 bytes per sector */
		RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
		req = rf_CreateDiskQueueData(RF_IO_TYPE_READ,
		    pda->startSector, pda->numSector, undoBuf, parityStripeID, which_ru,
		    node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
		    (void *) (node->dagHdr->raidPtr), flags, b_proc);

		if (!req) {
			(node->wakeFunc) (node, ENOMEM);
		} else {
			node->dagFuncData = (void *) req;
			rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
		}
	}
	return (0);
#endif				/* RF_BACKWARD > 0 */

	/* normal processing (rollaway or forward recovery) begins here;
	 * unreachable when RF_BACKWARD > 0 (the return above ends the
	 * function in that configuration) */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node, NULL,
	    node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    flags, b_proc);

	if (!req) {
		/* could not allocate the request: wake the node with an error */
		(node->wakeFunc) (node, ENOMEM);
	} else {
		/* node completes later, when the disk I/O callback fires */
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}

	return (0);
}
422/*****************************************************************************************
423 * the undo function for disk nodes
424 * Note:  this is not a proper undo of a write node, only locks are released.
425 *        old data is not restored to disk!
426 ****************************************************************************************/
int
rf_DiskUndoFunc(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	/* enqueue a NOP request whose only effect is to release the disk
	 * queue lock; no data is transferred and no old data is restored */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
451/*****************************************************************************************
452 * the execution function associated with an "unlock disk queue" node
453 ****************************************************************************************/
int
rf_DiskUnlockFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	/* enqueue a NOP request that unlocks the disk queue identified by
	 * the node's pda (same mechanism as rf_DiskUndoFunc above) */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
478/*****************************************************************************************
479 * Callback routine for DiskRead and DiskWrite nodes.  When the disk op completes,
480 * the routine is called to set the node status and inform the execution engine that
481 * the node has fired.
482 ****************************************************************************************/
int
rf_GenericWakeupFunc(node, status)
	RF_DagNode_t *node;
	int     status;
{
	/* status is the result of the I/O (0 = success, nonzero = error);
	 * node->status tells us which phase of execution just completed */
	switch (node->status) {
	case rf_bwd1:
		/* backward-recovery pre-read finished: free its queue data
		 * and re-enter the write function to do the real write */
		node->status = rf_bwd2;
		if (node->dagFuncData)
			rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
		return (rf_DiskWriteFuncForThreads(node));
		break;
	case rf_fired:
		/* normal forward execution */
		if (status)
			node->status = rf_bad;
		else
			node->status = rf_good;
		break;
	case rf_recover:
		/* probably should never reach this case */
		if (status)
			node->status = rf_panic;
		else
			node->status = rf_undone;
		break;
	default:
		RF_PANIC();
		break;
	}
	/* release the disk queue request before handing the node back to
	 * the engine (called from interrupt context) */
	if (node->dagFuncData)
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
	return (rf_FinishNode(node, RF_INTR_CONTEXT));
}
516
517
518/*****************************************************************************************
519 * there are three distinct types of xor nodes
520 * A "regular xor" is used in the fault-free case where the access spans a complete
521 * stripe unit.  It assumes that the result buffer is one full stripe unit in size,
522 * and uses the stripe-unit-offset values that it computes from the PDAs to determine
523 * where within the stripe unit to XOR each argument buffer.
524 *
525 * A "simple xor" is used in the fault-free case where the access touches only a portion
526 * of one (or two, in some cases) stripe unit(s).  It assumes that all the argument
527 * buffers are of the same size and have the same stripe unit offset.
528 *
529 * A "recovery xor" is used in the degraded-mode case.  It's similar to the regular
530 * xor function except that it takes the failed PDA as an additional parameter, and
531 * uses it to determine what portions of the argument buffers need to be xor'd into
532 * the result buffer, and where in the result buffer they should go.
533 ****************************************************************************************/
534
535/* xor the params together and store the result in the result field.
536 * assume the result field points to a buffer that is the size of one SU,
537 * and use the pda params to determine where within the buffer to XOR
538 * the input buffers.
539 */
int
rf_RegularXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	int     i, retcode;
#if RF_BACKWARD > 0
	RF_PhysDiskAddr_t *pda;
	caddr_t undoBuf;
#endif

	retcode = 0;
	if (node->dagHdr->status == rf_enable) {
		/* don't do the XOR if the input is the same as the output */
		RF_ETIMER_START(timer);
		/* params are (pda, buf) pairs; the final param is raidPtr */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
#if RF_BACKWARD > 0
				/* This section mimics undo logging for
				 * backward error recovery experiments by
				 * allocating and initializing a buffer XXX
				 * 512 byte sector size is hard coded! */
				pda = node->params[i].p;
				if (node->dagHdr->allocList == NULL)
					rf_MakeAllocList(node->dagHdr->allocList);
				RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif				/* RF_BACKWARD > 0 */
				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
				    (char *) node->params[i + 1].p, (char *) node->results[0], node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
580/* xor the inputs into the result buffer, ignoring placement issues */
int
rf_SimpleXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	int     i, retcode = 0;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#if RF_BACKWARD > 0
	RF_PhysDiskAddr_t *pda;
	caddr_t undoBuf;
#endif

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* don't do the XOR if the input is the same as the output */
		/* params are (pda, buf) pairs; the final param is raidPtr.
		 * all buffers are assumed to share size and SU offset, so
		 * each input is xor'd straight into the result buffer */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
#if RF_BACKWARD > 0
				/* This section mimics undo logging for
				 * backward error recovery experiments by
				 * allocating and initializing a buffer XXX
				 * 512 byte sector size is hard coded! */
				pda = node->params[i].p;
				if (node->dagHdr->allocList == NULL)
					rf_MakeAllocList(node->dagHdr->allocList);
				RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif				/* RF_BACKWARD > 0 */
				retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0],
				    rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector),
				    (struct buf *) node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
621/* this xor is used by the degraded-mode dag functions to recover lost data.
622 * the second-to-last parameter is the PDA for the failed portion of the access.
623 * the code here looks at this PDA and assumes that the xor target buffer is
624 * equal in size to the number of sectors in the failed PDA.  It then uses
625 * the other PDAs in the parameter list to determine where within the target
626 * buffer the corresponding data should be xored.
627 */
int
rf_RecoveryXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	int     i, retcode = 0;
	RF_PhysDiskAddr_t *pda;
	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
#if RF_BACKWARD > 0
	caddr_t undoBuf;
#endif

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* params are (pda, buf) pairs followed by the failed PDA and
		 * raidPtr; each surviving buffer is xor'd into the result at
		 * the offset implied by its SU offset relative to the failed
		 * PDA's SU offset */
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
#if RF_BACKWARD > 0
				/* This section mimics undo logging for
				 * backward error recovery experiments by
				 * allocating and initializing a buffer XXX
				 * 512 byte sector size is hard coded! */
				if (node->dagHdr->allocList == NULL)
					rf_MakeAllocList(node->dagHdr->allocList);
				RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif				/* RF_BACKWARD > 0 */
				srcbuf = (char *) node->params[i + 1].p;
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));
}
670/*****************************************************************************************
671 * The next three functions are utilities used by the above xor-execution functions.
672 ****************************************************************************************/
673
674
675/*
676 * this is just a glorified buffer xor.  targbuf points to a buffer that is one full stripe unit
677 * in size.  srcbuf points to a buffer that may be less than 1 SU, but never more.  When the
678 * access described by pda is one SU in size (which by implication means it's SU-aligned),
679 * all that happens is (targbuf) <- (srcbuf ^ targbuf).  When the access is less than one
680 * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
681 */
682
int
rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
	RF_Raid_t *raidPtr;
	RF_PhysDiskAddr_t *pda;
	char   *srcbuf;
	char   *targbuf;
	void   *bp;
{
	char   *targptr;
	int     sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	/* offset (in sectors) of the access within its stripe unit */
	int     SUOffset = pda->startSector % sectPerSU;
	int     length, retcode = 0;

	/* the access may never span more than one stripe unit */
	RF_ASSERT(pda->numSector <= sectPerSU);

	/* xor srcbuf into the portion of targbuf the pda describes */
	targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
	length = rf_RaidAddressToByte(raidPtr, pda->numSector);
	retcode = rf_bxor(srcbuf, targptr, length, bp);
	return (retcode);
}
703/* it really should be the case that the buffer pointers (returned by malloc)
704 * are aligned to the natural word size of the machine, so this is the only
705 * case we optimize for.  The length should always be a multiple of the sector
706 * size, so there should be no problem with leftover bytes at the end.
707 */
int
rf_bxor(src, dest, len, bp)
	char   *src;
	char   *dest;
	int     len;
	void   *bp;
{
	unsigned mask = sizeof(long) - 1, retcode = 0;

	/* only the longword-aligned case is implemented: both pointers and
	 * the byte count must be multiples of sizeof(long), otherwise we
	 * assert.  len is converted from bytes to longwords for the word-
	 * at-a-time xor routine. */
	if (!(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len & mask)) {
		retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len >> RF_LONGSHIFT, bp);
	} else {
		RF_ASSERT(0);
	}
	return (retcode);
}
724/* map a user buffer into kernel space, if necessary */
725#define REMAP_VA(_bp,x,y) (y) = (x)
726
727/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
728 * We don't want to assume anything about which input buffers are in kernel/user
729 * space, nor about their alignment, so in each loop we compute the maximum number
730 * of bytes that we can xor without crossing any page boundaries, and do only this many
731 * bytes before the next remap.
732 */
int
rf_longword_bxor(src, dest, len, bp)
	register unsigned long *src;
	register unsigned long *dest;
	int     len;		/* longwords */
	void   *bp;
{
	register unsigned long *end = src + len;
	register unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
	register unsigned long *pg_src, *pg_dest;	/* per-page source/dest
							 * pointers */
	int     longs_this_time;/* # longwords to xor in the current iteration */

	/* src/dest track overall progress; pg_src/pg_dest are the (possibly
	 * remapped) pointers actually dereferenced.  With the identity
	 * REMAP_VA above they coincide. */
	REMAP_VA(bp, src, pg_src);
	REMAP_VA(bp, dest, pg_dest);
	if (!pg_src || !pg_dest)
		return (EFAULT);

	while (len >= 4) {
		/* xor at most up to the next page boundary of either buffer */
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);	/* note len in longwords */
		src += longs_this_time;
		dest += longs_this_time;
		len -= longs_this_time;
		/* unrolled 4-at-a-time xor for the bulk of the run */
		while (longs_this_time >= 4) {
			d0 = pg_dest[0];
			d1 = pg_dest[1];
			d2 = pg_dest[2];
			d3 = pg_dest[3];
			s0 = pg_src[0];
			s1 = pg_src[1];
			s2 = pg_src[2];
			s3 = pg_src[3];
			pg_dest[0] = d0 ^ s0;
			pg_dest[1] = d1 ^ s1;
			pg_dest[2] = d2 ^ s2;
			pg_dest[3] = d3 ^ s3;
			pg_src += 4;
			pg_dest += 4;
			longs_this_time -= 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dest++ ^= *pg_src++;
			longs_this_time--;
		}

		/* either we're done, or we've reached a page boundary on one
		 * (or possibly both) of the pointers */
		if (len) {
			if (RF_PAGE_ALIGNED(src))
				REMAP_VA(bp, src, pg_src);
			if (RF_PAGE_ALIGNED(dest))
				REMAP_VA(bp, dest, pg_dest);
			if (!pg_src || !pg_dest)
				return (EFAULT);
		}
	}
	/* fewer than 4 longwords remain: finish one at a time, remapping at
	 * each page boundary */
	while (src < end) {
		*pg_dest++ ^= *pg_src++;
		src++;
		dest++;
		len--;
		if (RF_PAGE_ALIGNED(src))
			REMAP_VA(bp, src, pg_src);
		if (RF_PAGE_ALIGNED(dest))
			REMAP_VA(bp, dest, pg_dest);
	}
	RF_ASSERT(len == 0);
	return (0);
}
803
804
805/*
806   dst = a ^ b ^ c;
807   a may equal dst
808   see comment above longword_bxor
809*/
810int
811rf_longword_bxor3(dst, a, b, c, len, bp)
812	register unsigned long *dst;
813	register unsigned long *a;
814	register unsigned long *b;
815	register unsigned long *c;
816	int     len;		/* length in longwords */
817	void   *bp;
818{
819	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
820	register unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;	/* per-page source/dest
821								 * pointers */
822	int     longs_this_time;/* # longs to xor in the current iteration */
823	char    dst_is_a = 0;
824
825	REMAP_VA(bp, a, pg_a);
826	REMAP_VA(bp, b, pg_b);
827	REMAP_VA(bp, c, pg_c);
828	if (a == dst) {
829		pg_dst = pg_a;
830		dst_is_a = 1;
831	} else {
832		REMAP_VA(bp, dst, pg_dst);
833	}
834
835	/* align dest to cache line.  Can't cross a pg boundary on dst here. */
836	while ((((unsigned long) pg_dst) & 0x1f)) {
837		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
838		dst++;
839		a++;
840		b++;
841		c++;
842		if (RF_PAGE_ALIGNED(a)) {
843			REMAP_VA(bp, a, pg_a);
844			if (!pg_a)
845				return (EFAULT);
846		}
847		if (RF_PAGE_ALIGNED(b)) {
848			REMAP_VA(bp, a, pg_b);
849			if (!pg_b)
850				return (EFAULT);
851		}
852		if (RF_PAGE_ALIGNED(c)) {
853			REMAP_VA(bp, a, pg_c);
854			if (!pg_c)
855				return (EFAULT);
856		}
857		len--;
858	}
859
860	while (len > 4) {
861		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
862		a += longs_this_time;
863		b += longs_this_time;
864		c += longs_this_time;
865		dst += longs_this_time;
866		len -= longs_this_time;
867		while (longs_this_time >= 4) {
868			a0 = pg_a[0];
869			longs_this_time -= 4;
870
871			a1 = pg_a[1];
872			a2 = pg_a[2];
873
874			a3 = pg_a[3];
875			pg_a += 4;
876
877			b0 = pg_b[0];
878			b1 = pg_b[1];
879
880			b2 = pg_b[2];
881			b3 = pg_b[3];
882			/* start dual issue */
883			a0 ^= b0;
884			b0 = pg_c[0];
885
886			pg_b += 4;
887			a1 ^= b1;
888
889			a2 ^= b2;
890			a3 ^= b3;
891
892			b1 = pg_c[1];
893			a0 ^= b0;
894
895			b2 = pg_c[2];
896			a1 ^= b1;
897
898			b3 = pg_c[3];
899			a2 ^= b2;
900
901			pg_dst[0] = a0;
902			a3 ^= b3;
903			pg_dst[1] = a1;
904			pg_c += 4;
905			pg_dst[2] = a2;
906			pg_dst[3] = a3;
907			pg_dst += 4;
908		}
909		while (longs_this_time > 0) {	/* cannot cross any page
910						 * boundaries here */
911			*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
912			longs_this_time--;
913		}
914
915		if (len) {
916			if (RF_PAGE_ALIGNED(a)) {
917				REMAP_VA(bp, a, pg_a);
918				if (!pg_a)
919					return (EFAULT);
920				if (dst_is_a)
921					pg_dst = pg_a;
922			}
923			if (RF_PAGE_ALIGNED(b)) {
924				REMAP_VA(bp, b, pg_b);
925				if (!pg_b)
926					return (EFAULT);
927			}
928			if (RF_PAGE_ALIGNED(c)) {
929				REMAP_VA(bp, c, pg_c);
930				if (!pg_c)
931					return (EFAULT);
932			}
933			if (!dst_is_a)
934				if (RF_PAGE_ALIGNED(dst)) {
935					REMAP_VA(bp, dst, pg_dst);
936					if (!pg_dst)
937						return (EFAULT);
938				}
939		}
940	}
941	while (len) {
942		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
943		dst++;
944		a++;
945		b++;
946		c++;
947		if (RF_PAGE_ALIGNED(a)) {
948			REMAP_VA(bp, a, pg_a);
949			if (!pg_a)
950				return (EFAULT);
951			if (dst_is_a)
952				pg_dst = pg_a;
953		}
954		if (RF_PAGE_ALIGNED(b)) {
955			REMAP_VA(bp, b, pg_b);
956			if (!pg_b)
957				return (EFAULT);
958		}
959		if (RF_PAGE_ALIGNED(c)) {
960			REMAP_VA(bp, c, pg_c);
961			if (!pg_c)
962				return (EFAULT);
963		}
964		if (!dst_is_a)
965			if (RF_PAGE_ALIGNED(dst)) {
966				REMAP_VA(bp, dst, pg_dst);
967				if (!pg_dst)
968					return (EFAULT);
969			}
970		len--;
971	}
972	return (0);
973}
974
int
rf_bxor3(dst, a, b, c, len, bp)
	register unsigned char *dst;
	register unsigned char *a;
	register unsigned char *b;
	register unsigned char *c;
	unsigned long len;
	void   *bp;
{
	/* all four buffers and the byte count must be 8-byte aligned so the
	 * xor can be done a longword at a time; len is converted from bytes
	 * to longwords for rf_longword_bxor3 */
	RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0);

	return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
		(unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp));
}
989