rf_dagfuncs.c revision 1.8
1/*	$NetBSD: rf_dagfuncs.c,v 1.8 2001/11/13 07:11:13 lukem Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*
30 * dagfuncs.c -- DAG node execution routines
31 *
32 * Rules:
33 * 1. Every DAG execution function must eventually cause node->status to
34 *    get set to "good" or "bad", and "FinishNode" to be called. In the
35 *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
36 *    the node execution function can do these two things directly. In
37 *    the case of nodes that have to wait for some event (a disk read to
38 *    complete, a lock to be released, etc) to occur before they can
39 *    complete, this is typically achieved by having whatever module
40 *    is doing the operation call GenericWakeupFunc upon completion.
41 * 2. DAG execution functions should check the status in the DAG header
42 *    and NOP out their operations if the status is not "enable". However,
43 *    execution functions that release resources must be sure to release
44 *    them even when they NOP out the function that would use them.
45 *    Functions that acquire resources should go ahead and acquire them
46 *    even when they NOP, so that a downstream release node will not have
47 *    to check to find out whether or not the acquire was suppressed.
48 */
49
50#include <sys/cdefs.h>
51__KERNEL_RCSID(0, "$NetBSD: rf_dagfuncs.c,v 1.8 2001/11/13 07:11:13 lukem Exp $");
52
53#include <sys/param.h>
54#include <sys/ioctl.h>
55
56#include "rf_archs.h"
57#include "rf_raid.h"
58#include "rf_dag.h"
59#include "rf_layout.h"
60#include "rf_etimer.h"
61#include "rf_acctrace.h"
62#include "rf_diskqueue.h"
63#include "rf_dagfuncs.h"
64#include "rf_general.h"
65#include "rf_engine.h"
66#include "rf_dagutils.h"
67
68#include "rf_kintf.h"
69
70#if RF_INCLUDE_PARITYLOGGING > 0
71#include "rf_paritylog.h"
72#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
73
74int     (*rf_DiskReadFunc) (RF_DagNode_t *);
75int     (*rf_DiskWriteFunc) (RF_DagNode_t *);
76int     (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
77int     (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
78int     (*rf_DiskUnlockFunc) (RF_DagNode_t *);
79int     (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
80int     (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
81int     (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
82int     (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
83
84/*****************************************************************************************
85 * main (only) configuration routine for this module
86 ****************************************************************************************/
87int
88rf_ConfigureDAGFuncs(listp)
89	RF_ShutdownList_t **listp;
90{
91	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
92	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
93	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
94	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
95	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
96	rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
97	rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
98	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
99	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
100	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
101	return (0);
102}
103
104
105
106/*****************************************************************************************
107 * the execution function associated with a terminate node
108 ****************************************************************************************/
109int
110rf_TerminateFunc(node)
111	RF_DagNode_t *node;
112{
113	RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
114	node->status = rf_good;
115	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
116}
117
118int
119rf_TerminateUndoFunc(node)
120	RF_DagNode_t *node;
121{
122	return (0);
123}
124
125
126/*****************************************************************************************
127 * execution functions associated with a mirror node
128 *
129 * parameters:
130 *
131 * 0 - physical disk addres of data
132 * 1 - buffer for holding read data
133 * 2 - parity stripe ID
134 * 3 - flags
135 * 4 - physical disk address of mirror (parity)
136 *
137 ****************************************************************************************/
138
139int
140rf_DiskReadMirrorIdleFunc(node)
141	RF_DagNode_t *node;
142{
143	/* select the mirror copy with the shortest queue and fill in node
144	 * parameters with physical disk address */
145
146	rf_SelectMirrorDiskIdle(node);
147	return (rf_DiskReadFunc(node));
148}
149
150int
151rf_DiskReadMirrorPartitionFunc(node)
152	RF_DagNode_t *node;
153{
154	/* select the mirror copy with the shortest queue and fill in node
155	 * parameters with physical disk address */
156
157	rf_SelectMirrorDiskPartition(node);
158	return (rf_DiskReadFunc(node));
159}
160
161int
162rf_DiskReadMirrorUndoFunc(node)
163	RF_DagNode_t *node;
164{
165	return (0);
166}
167
168
169
170#if RF_INCLUDE_PARITYLOGGING > 0
/*****************************************************************************************
 * the execution function associated with a parity log update node
 ****************************************************************************************/
int
rf_ParityLogUpdateFunc(node)
	RF_DagNode_t *node;
{
	/* params[0]: physical disk address of the data being logged */
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	/* params[1]: buffer holding that data */
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	/* NOP the operation entirely when the DAG is not enabled */
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* wakeFunc/node are handed to the log layer; presumably it
		 * completes the node asynchronously — confirm in rf_paritylog.c */
		logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
		    (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node,
		    node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			/* allocation failed: charge the elapsed time to the
			 * parity-log statistic and fail the node with ENOMEM */
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
201
202
/*****************************************************************************************
 * the execution function associated with a parity log overwrite node
 ****************************************************************************************/
int
rf_ParityLogOverwriteFunc(node)
	RF_DagNode_t *node;
{
	/* params[0]: physical disk address of the data; params[1]: its buffer */
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	/* NOP the operation entirely when the DAG is not enabled.
	 * Identical to rf_ParityLogUpdateFunc except the RF_OVERWRITE type. */
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			/* allocation failed: record elapsed time, fail node */
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}
231#else				/* RF_INCLUDE_PARITYLOGGING > 0 */
232
/* stub when parity logging is configured out */
int
rf_ParityLogUpdateFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
/* stub when parity logging is configured out */
int
rf_ParityLogOverwriteFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
245#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
246
247int
248rf_ParityLogUpdateUndoFunc(node)
249	RF_DagNode_t *node;
250{
251	return (0);
252}
253
254int
255rf_ParityLogOverwriteUndoFunc(node)
256	RF_DagNode_t *node;
257{
258	return (0);
259}
260/*****************************************************************************************
261 * the execution function associated with a NOP node
262 ****************************************************************************************/
263int
264rf_NullNodeFunc(node)
265	RF_DagNode_t *node;
266{
267	node->status = rf_good;
268	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
269}
270
271int
272rf_NullNodeUndoFunc(node)
273	RF_DagNode_t *node;
274{
275	node->status = rf_undone;
276	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
277}
278
279
/*****************************************************************************************
 * the execution function associated with a disk-read node
 *
 * node params: [0] physical disk address, [1] data buffer,
 *              [2] parity stripe ID, [3] packed priority/lock/unlock/RU flags
 ****************************************************************************************/
int
rf_DiskReadFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	/* params[3] packs four fields into one word */
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	/* if the DAG is disabled, still issue a NOP so that any queue
	 * lock/unlock side effects are preserved (see rule 2 at top of file) */
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void   *b_proc = NULL;

	/* propagate the originating process, if the access came in via a buf */
	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	/* lock and unlock are mutually exclusive */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;

	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    node, NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr), flags, b_proc);
	if (!req) {
		/* could not build the request: fail the node immediately */
		(node->wakeFunc) (node, ENOMEM);
	} else {
		/* remember the request so the wakeup func can free it */
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}
	return (0);
}
320
321
/*****************************************************************************************
 * the execution function associated with a disk-write node
 *
 * node params: [0] physical disk address, [1] data buffer,
 *              [2] parity stripe ID, [3] packed priority/lock/unlock/RU flags
 ****************************************************************************************/
int
rf_DiskWriteFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	/* params[3] packs four fields into one word */
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	/* if the DAG is disabled, still issue a NOP so that any queue
	 * lock/unlock side effects are preserved (see rule 2 at top of file) */
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void   *b_proc = NULL;

	/* propagate the originating process, if the access came in via a buf */
	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	/* normal processing (rollaway or forward recovery) begins here */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node, NULL,
	    node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    flags, b_proc);

	if (!req) {
		/* could not build the request: fail the node immediately */
		(node->wakeFunc) (node, ENOMEM);
	} else {
		/* remember the request so the wakeup func can free it */
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}

	return (0);
}
/*****************************************************************************************
 * the undo function for disk nodes
 * Note:  this is not a proper undo of a write node, only locks are released.
 *        old data is not restored to disk!
 ****************************************************************************************/
int
rf_DiskUndoFunc(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	/* params[0] still holds the original access's disk address; it is
	 * used only to locate the queue whose lock must be released */
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	/* issue a NOP whose only effect is RF_UNLOCK_DISK_QUEUE */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		/* remember the request so the wakeup func can free it */
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
/*****************************************************************************************
 * the execution function associated with an "unlock disk queue" node
 * NOTE(review): this body is identical to rf_DiskUndoFunc above except for
 * the I/O priority symbol; consider sharing the implementation.
 ****************************************************************************************/
int
rf_DiskUnlockFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	/* params[0] identifies the disk whose queue is to be unlocked */
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	/* issue a NOP whose only effect is RF_UNLOCK_DISK_QUEUE */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		/* remember the request so the wakeup func can free it */
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
/*****************************************************************************************
 * Callback routine for DiskRead and DiskWrite nodes.  When the disk op completes,
 * the routine is called to set the node status and inform the execution engine that
 * the node has fired.
 ****************************************************************************************/
int
rf_GenericWakeupFunc(node, status)
	RF_DagNode_t *node;
	int     status;		/* 0 on success, nonzero on failure */
{
	switch (node->status) {
	case rf_bwd1:
		/* backward-recovery pass 1 finished: free the completed
		 * request and re-fire this node as a disk write (pass 2) */
		node->status = rf_bwd2;
		if (node->dagFuncData)
			rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
		return (rf_DiskWriteFuncForThreads(node));
		break;
	case rf_fired:
		/* normal completion: translate I/O status to node status */
		if (status)
			node->status = rf_bad;
		else
			node->status = rf_good;
		break;
	case rf_recover:
		/* probably should never reach this case */
		if (status)
			node->status = rf_panic;
		else
			node->status = rf_undone;
		break;
	default:
		/* any other state here indicates engine corruption */
		printf("rf_GenericWakeupFunc:");
		printf("node->status is %d,", node->status);
		printf("status is %d \n", status);
		RF_PANIC();
		break;
	}
	/* release the disk queue request created when the node fired */
	if (node->dagFuncData)
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
	return (rf_FinishNode(node, RF_INTR_CONTEXT));
}
463
464
465/*****************************************************************************************
466 * there are three distinct types of xor nodes
467 * A "regular xor" is used in the fault-free case where the access spans a complete
468 * stripe unit.  It assumes that the result buffer is one full stripe unit in size,
469 * and uses the stripe-unit-offset values that it computes from the PDAs to determine
470 * where within the stripe unit to XOR each argument buffer.
471 *
472 * A "simple xor" is used in the fault-free case where the access touches only a portion
473 * of one (or two, in some cases) stripe unit(s).  It assumes that all the argument
474 * buffers are of the same size and have the same stripe unit offset.
475 *
476 * A "recovery xor" is used in the degraded-mode case.  It's similar to the regular
477 * xor function except that it takes the failed PDA as an additional parameter, and
478 * uses it to determine what portions of the argument buffers need to be xor'd into
479 * the result buffer, and where in the result buffer they should go.
480 ****************************************************************************************/
481
/* xor the params together and store the result in the result field.
 * assume the result field points to a buffer that is the size of one SU,
 * and use the pda params to determine where within the buffer to XOR
 * the input buffers.
 *
 * params are (pda, buffer) pairs; the last param is the RF_Raid_t pointer.
 */
int
rf_RegularXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	int     i, retcode;

	retcode = 0;
	if (node->dagHdr->status == rf_enable) {
		/* don't do the XOR if the input is the same as the output */
		RF_ETIMER_START(timer);
		/* step through (pda, buffer) parameter pairs */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				/* NOTE: only the last iteration's return code
				 * is reported to the wakeup func */
				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
				    (char *) node->params[i + 1].p, (char *) node->results[0], node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/* xor the inputs into the result buffer, ignoring placement issues:
 * all argument buffers are assumed to be the same size and have the
 * same stripe unit offset, so the xor is done at offset zero.
 */
int
rf_SimpleXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	int     i, retcode = 0;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* don't do the XOR if the input is the same as the output */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				/* length comes from the pda of this
				 * (pda, buffer) parameter pair */
				retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0],
				    rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector),
				    (struct buf *) node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/* this xor is used by the degraded-mode dag functions to recover lost data.
 * the second-to-last parameter is the PDA for the failed portion of the access.
 * the code here looks at this PDA and assumes that the xor target buffer is
 * equal in size to the number of sectors in the failed PDA.  It then uses
 * the other PDAs in the parameter list to determine where within the target
 * buffer the corresponding data should be xored.
 */
int
rf_RecoveryXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	int     i, retcode = 0;
	RF_PhysDiskAddr_t *pda;
	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* step through the (pda, buffer) pairs, skipping the failed
		 * PDA and raidPtr at the end of the parameter list */
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				srcbuf = (char *) node->params[i + 1].p;
				/* place this source at its offset relative to
				 * the failed region within the target buffer */
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));
}
577/*****************************************************************************************
578 * The next three functions are utilities used by the above xor-execution functions.
579 ****************************************************************************************/
580
581
582/*
583 * this is just a glorified buffer xor.  targbuf points to a buffer that is one full stripe unit
584 * in size.  srcbuf points to a buffer that may be less than 1 SU, but never more.  When the
585 * access described by pda is one SU in size (which by implication means it's SU-aligned),
586 * all that happens is (targbuf) <- (srcbuf ^ targbuf).  When the access is less than one
587 * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
588 */
589
590int
591rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
592	RF_Raid_t *raidPtr;
593	RF_PhysDiskAddr_t *pda;
594	char   *srcbuf;
595	char   *targbuf;
596	void   *bp;
597{
598	char   *targptr;
599	int     sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
600	int     SUOffset = pda->startSector % sectPerSU;
601	int     length, retcode = 0;
602
603	RF_ASSERT(pda->numSector <= sectPerSU);
604
605	targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
606	length = rf_RaidAddressToByte(raidPtr, pda->numSector);
607	retcode = rf_bxor(srcbuf, targptr, length, bp);
608	return (retcode);
609}
610/* it really should be the case that the buffer pointers (returned by malloc)
611 * are aligned to the natural word size of the machine, so this is the only
612 * case we optimize for.  The length should always be a multiple of the sector
613 * size, so there should be no problem with leftover bytes at the end.
614 */
615int
616rf_bxor(src, dest, len, bp)
617	char   *src;
618	char   *dest;
619	int     len;
620	void   *bp;
621{
622	unsigned mask = sizeof(long) - 1, retcode = 0;
623
624	if (!(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len & mask)) {
625		retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len >> RF_LONGSHIFT, bp);
626	} else {
627		RF_ASSERT(0);
628	}
629	return (retcode);
630}
/* map a user buffer into kernel space, if necessary.  In this kernel build it
 * is the identity mapping: (y) is simply assigned (x). */
#define REMAP_VA(_bp,x,y) (y) = (x)

/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
 * We don't want to assume anything about which input buffers are in kernel/user
 * space, nor about their alignment, so in each loop we compute the maximum number
 * of bytes that we can xor without crossing any page boundaries, and do only this many
 * bytes before the next remap.
 *
 * dest ^= src, len longwords.  Returns 0 on success, EFAULT if a remap fails.
 */
int
rf_longword_bxor(src, dest, len, bp)
	unsigned long *src;
	unsigned long *dest;
	int     len;		/* longwords */
	void   *bp;
{
	unsigned long *end = src + len;
	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
	unsigned long *pg_src, *pg_dest;	/* per-page source/dest
							 * pointers */
	int     longs_this_time;/* # longwords to xor in the current iteration */

	REMAP_VA(bp, src, pg_src);
	REMAP_VA(bp, dest, pg_dest);
	if (!pg_src || !pg_dest)
		return (EFAULT);

	while (len >= 4) {
		/* xor at most up to the next page boundary of either buffer.
		 * src/dest/len are advanced now; the actual work uses the
		 * per-page pointers below. */
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);	/* note len in longwords */
		src += longs_this_time;
		dest += longs_this_time;
		len -= longs_this_time;
		/* unrolled by four: load all eight operands before storing */
		while (longs_this_time >= 4) {
			d0 = pg_dest[0];
			d1 = pg_dest[1];
			d2 = pg_dest[2];
			d3 = pg_dest[3];
			s0 = pg_src[0];
			s1 = pg_src[1];
			s2 = pg_src[2];
			s3 = pg_src[3];
			pg_dest[0] = d0 ^ s0;
			pg_dest[1] = d1 ^ s1;
			pg_dest[2] = d2 ^ s2;
			pg_dest[3] = d3 ^ s3;
			pg_src += 4;
			pg_dest += 4;
			longs_this_time -= 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dest++ ^= *pg_src++;
			longs_this_time--;
		}

		/* either we're done, or we've reached a page boundary on one
		 * (or possibly both) of the pointers */
		if (len) {
			if (RF_PAGE_ALIGNED(src))
				REMAP_VA(bp, src, pg_src);
			if (RF_PAGE_ALIGNED(dest))
				REMAP_VA(bp, dest, pg_dest);
			if (!pg_src || !pg_dest)
				return (EFAULT);
		}
	}
	/* fewer than 4 longwords remain: finish one at a time, remapping
	 * whenever a pointer crosses onto a new page */
	while (src < end) {
		*pg_dest++ ^= *pg_src++;
		src++;
		dest++;
		len--;
		if (RF_PAGE_ALIGNED(src))
			REMAP_VA(bp, src, pg_src);
		if (RF_PAGE_ALIGNED(dest))
			REMAP_VA(bp, dest, pg_dest);
	}
	RF_ASSERT(len == 0);
	return (0);
}
710
711
712/*
713   dst = a ^ b ^ c;
714   a may equal dst
715   see comment above longword_bxor
716*/
717int
718rf_longword_bxor3(dst, a, b, c, len, bp)
719	unsigned long *dst;
720	unsigned long *a;
721	unsigned long *b;
722	unsigned long *c;
723	int     len;		/* length in longwords */
724	void   *bp;
725{
726	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
727	unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;	/* per-page source/dest
728								 * pointers */
729	int     longs_this_time;/* # longs to xor in the current iteration */
730	char    dst_is_a = 0;
731
732	REMAP_VA(bp, a, pg_a);
733	REMAP_VA(bp, b, pg_b);
734	REMAP_VA(bp, c, pg_c);
735	if (a == dst) {
736		pg_dst = pg_a;
737		dst_is_a = 1;
738	} else {
739		REMAP_VA(bp, dst, pg_dst);
740	}
741
742	/* align dest to cache line.  Can't cross a pg boundary on dst here. */
743	while ((((unsigned long) pg_dst) & 0x1f)) {
744		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
745		dst++;
746		a++;
747		b++;
748		c++;
749		if (RF_PAGE_ALIGNED(a)) {
750			REMAP_VA(bp, a, pg_a);
751			if (!pg_a)
752				return (EFAULT);
753		}
754		if (RF_PAGE_ALIGNED(b)) {
755			REMAP_VA(bp, a, pg_b);
756			if (!pg_b)
757				return (EFAULT);
758		}
759		if (RF_PAGE_ALIGNED(c)) {
760			REMAP_VA(bp, a, pg_c);
761			if (!pg_c)
762				return (EFAULT);
763		}
764		len--;
765	}
766
767	while (len > 4) {
768		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
769		a += longs_this_time;
770		b += longs_this_time;
771		c += longs_this_time;
772		dst += longs_this_time;
773		len -= longs_this_time;
774		while (longs_this_time >= 4) {
775			a0 = pg_a[0];
776			longs_this_time -= 4;
777
778			a1 = pg_a[1];
779			a2 = pg_a[2];
780
781			a3 = pg_a[3];
782			pg_a += 4;
783
784			b0 = pg_b[0];
785			b1 = pg_b[1];
786
787			b2 = pg_b[2];
788			b3 = pg_b[3];
789			/* start dual issue */
790			a0 ^= b0;
791			b0 = pg_c[0];
792
793			pg_b += 4;
794			a1 ^= b1;
795
796			a2 ^= b2;
797			a3 ^= b3;
798
799			b1 = pg_c[1];
800			a0 ^= b0;
801
802			b2 = pg_c[2];
803			a1 ^= b1;
804
805			b3 = pg_c[3];
806			a2 ^= b2;
807
808			pg_dst[0] = a0;
809			a3 ^= b3;
810			pg_dst[1] = a1;
811			pg_c += 4;
812			pg_dst[2] = a2;
813			pg_dst[3] = a3;
814			pg_dst += 4;
815		}
816		while (longs_this_time > 0) {	/* cannot cross any page
817						 * boundaries here */
818			*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
819			longs_this_time--;
820		}
821
822		if (len) {
823			if (RF_PAGE_ALIGNED(a)) {
824				REMAP_VA(bp, a, pg_a);
825				if (!pg_a)
826					return (EFAULT);
827				if (dst_is_a)
828					pg_dst = pg_a;
829			}
830			if (RF_PAGE_ALIGNED(b)) {
831				REMAP_VA(bp, b, pg_b);
832				if (!pg_b)
833					return (EFAULT);
834			}
835			if (RF_PAGE_ALIGNED(c)) {
836				REMAP_VA(bp, c, pg_c);
837				if (!pg_c)
838					return (EFAULT);
839			}
840			if (!dst_is_a)
841				if (RF_PAGE_ALIGNED(dst)) {
842					REMAP_VA(bp, dst, pg_dst);
843					if (!pg_dst)
844						return (EFAULT);
845				}
846		}
847	}
848	while (len) {
849		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
850		dst++;
851		a++;
852		b++;
853		c++;
854		if (RF_PAGE_ALIGNED(a)) {
855			REMAP_VA(bp, a, pg_a);
856			if (!pg_a)
857				return (EFAULT);
858			if (dst_is_a)
859				pg_dst = pg_a;
860		}
861		if (RF_PAGE_ALIGNED(b)) {
862			REMAP_VA(bp, b, pg_b);
863			if (!pg_b)
864				return (EFAULT);
865		}
866		if (RF_PAGE_ALIGNED(c)) {
867			REMAP_VA(bp, c, pg_c);
868			if (!pg_c)
869				return (EFAULT);
870		}
871		if (!dst_is_a)
872			if (RF_PAGE_ALIGNED(dst)) {
873				REMAP_VA(bp, dst, pg_dst);
874				if (!pg_dst)
875					return (EFAULT);
876			}
877		len--;
878	}
879	return (0);
880}
881
882int
883rf_bxor3(dst, a, b, c, len, bp)
884	unsigned char *dst;
885	unsigned char *a;
886	unsigned char *b;
887	unsigned char *c;
888	unsigned long len;
889	void   *bp;
890{
891	RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0);
892
893	return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
894		(unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp));
895}
896